orinium_browser/engine/html/
parser.rs1use crate::engine::html::tokenizer::{Attribute, Token, Tokenizer};
4use crate::engine::html::util as html_util;
5use crate::engine::tree::*;
6use std::cell::RefCell;
7use std::rc::Rc;
8
9#[derive(Debug, Clone)]
10pub enum HtmlNodeType {
11 Document,
12 Element {
13 tag_name: String,
14 attributes: Vec<Attribute>,
15 },
16 Text(String),
17 Comment(String),
18 Doctype {
19 name: Option<String>,
20 public_id: Option<String>,
21 system_id: Option<String>,
22 },
23 InvalidNode(Token, String), }
25
26impl HtmlNodeType {
27 pub fn tag_name(&self) -> Option<&str> {
28 match self {
29 HtmlNodeType::Element { tag_name, .. } => Some(tag_name),
30 _ => None,
31 }
32 }
33
34 pub fn get_attr(&self, name: &str) -> Option<&str> {
35 match self {
36 HtmlNodeType::Element { attributes, .. } => attributes
37 .iter()
38 .find(|attr| attr.name == name)
39 .map(|attr| attr.value.as_str()),
40 _ => None,
41 }
42 }
43 pub fn set_attr(&mut self, name: &str, value: String) {
44 if let HtmlNodeType::Element { attributes, .. } = self {
45 if let Some(attr) = attributes.iter_mut().find(|attr| attr.name == name) {
46 attr.value = value;
47 } else {
48 attributes.push(Attribute {
49 name: name.to_string(),
50 value,
51 });
52 }
53 }
54 }
55 pub fn remove_attr(&mut self, name: &str) -> Option<String> {
56 if let HtmlNodeType::Element { attributes, .. } = self {
57 attributes
58 .iter()
59 .position(|attr| attr.name == name)
60 .map(|pos| attributes.remove(pos).value)
61 } else {
62 None
63 }
64 }
65 pub fn has_attr(&self, name: &str) -> bool {
66 match self {
67 HtmlNodeType::Element { attributes, .. } => {
68 attributes.iter().any(|attr| attr.name == name)
69 }
70 _ => false,
71 }
72 }
73}
74
75pub type DomTree = Tree<HtmlNodeType>;
76
77impl DomTree {
78 pub fn get_elements_by_tag_name(&self, tag_name: &str) -> Vec<NodeRef<HtmlNodeType>> {
80 self.find_all(|n| {
81 if let HtmlNodeType::Element { tag_name: t, .. } = n {
82 t.eq_ignore_ascii_case(tag_name)
83 } else {
84 false
85 }
86 })
87 }
88
89 pub fn get_element_by_id(&self, id: &str) -> Option<NodeRef<HtmlNodeType>> {
91 self.find_all(|n| {
92 if let HtmlNodeType::Element { attributes, .. } = n {
93 attributes
94 .iter()
95 .any(|attr| attr.name == "id" && attr.value == id)
96 } else {
97 false
98 }
99 })
100 .into_iter()
101 .next()
102 }
103
104 pub fn get_elements_by_class_name(&self, class_name: &str) -> Vec<NodeRef<HtmlNodeType>> {
106 self.find_all(|n| {
107 if let HtmlNodeType::Element { attributes, .. } = n {
108 attributes.iter().any(|attr| {
109 attr.name == "class" && attr.value.split_whitespace().any(|c| c == class_name)
110 })
111 } else {
112 false
113 }
114 })
115 }
116
117 pub fn inner_text(node: &NodeRef<HtmlNodeType>) -> String {
119 let n = node.borrow();
120 match &n.value {
121 HtmlNodeType::Text(content) => content.clone(),
122 HtmlNodeType::Element { .. } => n.children().iter().map(DomTree::inner_text).collect(),
123 _ => "".to_string(),
124 }
125 }
126
127 pub fn set_text_content(node: &NodeRef<HtmlNodeType>, new_text: &str) {
129 let mut n = node.borrow_mut();
130 match &mut n.value {
131 HtmlNodeType::Text(content) => *content = new_text.to_string(),
132 HtmlNodeType::Element { .. } => {
133 n.clear_children();
135 let text_node = TreeNode::new(HtmlNodeType::Text(new_text.to_string()));
136 TreeNode::add_child(node, text_node);
137 }
138 _ => { }
139 }
140 }
141
142 pub fn collect_text_by_tag(&self, tag_name: &str) -> Vec<String> {
144 let mut texts = Vec::new();
145
146 self.traverse(|node| {
147 let n = node.borrow();
148 if let HtmlNodeType::Element { tag_name: t, .. } = &n.value
149 && t.eq_ignore_ascii_case(tag_name)
150 {
151 let text_of_this_node: String = n
152 .children()
153 .iter()
154 .filter_map(|child| {
155 let child_ref = child.borrow();
156 if let HtmlNodeType::Text(content) = &child_ref.value {
157 Some(content.clone())
158 } else {
159 None
160 }
161 })
162 .collect();
163
164 texts.push(text_of_this_node);
165 }
166 });
167
168 texts
169 }
170}
171
172pub struct Parser<'a> {
173 tokenizer: Tokenizer<'a>,
174 tree: DomTree,
175 stack: Vec<Rc<RefCell<TreeNode<HtmlNodeType>>>>,
176 tag_stack: Vec<String>,
177 special_text_mode: Option<String>, }
179
180impl<'a> Parser<'a> {
181 pub fn new(input: &'a str) -> Self {
182 let document = Tree::new(HtmlNodeType::Document);
183
184 Self {
185 tokenizer: Tokenizer::new(input),
186 tree: document.clone(),
187 stack: vec![document.root.clone()],
188 tag_stack: vec![],
189 special_text_mode: None,
190 }
191 }
192
193 pub fn parse(&mut self) -> DomTree {
194 while let Some(token) = self.tokenizer.next_token() {
195 log::debug!(target:"HtmlParser::Token" ,"Processing token: {token:?}");
196 match token {
197 Token::StartTag { .. } => self.handle_start_tag(token),
198 Token::EndTag { .. } => self.handle_end_tag(token),
199 Token::Doctype { .. } => self.handle_doctype(token),
200 Token::Comment(_) => self.handle_comment(token),
201 Token::Text(_) => self.handle_text(token),
202 }
203 }
204 self.autofill_elements();
205
206 self.tree.clone()
207 }
208
209 fn handle_start_tag(&mut self, token: Token) {
210 if let Token::StartTag {
211 name,
212 attributes,
213 self_closing,
214 } = token
215 {
216 let mut parent = Rc::clone(self.stack.last().unwrap());
217 if self.special_text_mode.is_some() {
218 TreeNode::add_child_value(&parent, HtmlNodeType::Text(format!("<{}>", name)));
221 return;
222 }
223
224 while self.check_start_tag_with_invalid_nesting(&name, &parent) {
225 if let HtmlNodeType::Element { tag_name, .. } = &parent.borrow().value {
226 log::info!(target:"HtmlParser::AutoClosing" ,"Auto-closing tag: <{}> to allow <{}> inside it.", tag_name, name);
227 self.handle_end_tag(Token::EndTag {
228 name: tag_name.clone(),
229 });
230 }
231 parent = Rc::clone(self.stack.last().unwrap());
232 }
233
234 let new_node = TreeNode::add_child_value(
235 &parent,
236 HtmlNodeType::Element {
237 tag_name: name.clone(),
238 attributes: attributes.clone(),
239 },
240 );
241
242 if name == "script" || name == "style" {
244 self.special_text_mode = Some(name.clone());
245 }
246
247 if !self_closing {
249 self.tag_stack.push(name.clone());
250 self.stack.push(new_node);
251 log::debug!(target:"HtmlParser::Stack" ,"Stack len: {}, +Pushed <{}> to stack.", self.stack.len(), name);
252 }
253 }
254 }
255
256 fn handle_end_tag(&mut self, token: Token) {
257 if let Token::EndTag { ref name } = token {
258 if self.special_text_mode.as_deref() == Some(name.as_str()) {
260 self.special_text_mode = None;
261 }
262
263 if self.special_text_mode.is_some() {
264 let parent = Rc::clone(self.stack.last().unwrap());
265 TreeNode::add_child_value(&parent, HtmlNodeType::Text(format!("</{}>", name)));
266 return;
267 }
268
269 let name = name.clone();
270 if self.tag_stack.contains(&name) {
271 while let Some(top) = self.stack.pop() {
272 if let HtmlNodeType::Element { tag_name, .. } = &top.borrow().value {
273 self.tag_stack.pop();
274 if tag_name == &name {
275 log::debug!(target:"HtmlParser::Stack" ,"Stack len: {}, -Popped </{}> from stack.", self.stack.len(), name);
276 break;
277 } else {
278 log::debug!(target:"HtmlParser::Stack" ,"Stack len: {}, Unmatched end tag: </{}>, Find <{}>", self.stack.len(), name, tag_name);
279 }
280 }
281 }
282 } else {
283 let parent = Rc::clone(self.stack.last().unwrap());
284 TreeNode::add_child_value(
285 &parent,
286 HtmlNodeType::InvalidNode(
287 token,
288 format!("No matching start tag for </{}>", name),
289 ),
290 );
291 log::debug!(target:"HtmlParser::Invalid" ,"Invalid end tag: </{}>", name);
292 }
293 }
294 }
295
296 fn handle_text(&mut self, token: Token) {
297 if let Token::Text(data) = token {
298 let parent = Rc::clone(self.stack.last().unwrap());
299
300 if self.special_text_mode.is_some() {
302 TreeNode::add_child_value(&parent, HtmlNodeType::Text(data));
303 return;
304 }
305
306 if let Some(parent_node) = parent.borrow().parent() {
308 let parent_node_borrow = parent_node.borrow();
309 if let HtmlNodeType::Element { tag_name, .. } = &parent_node_borrow.value {
310 if !matches!(tag_name.as_str(), "pre" | "textarea" | "script" | "style")
311 && data.trim().is_empty()
312 {
313 return;
314 }
315 } else if data.trim().is_empty() {
316 return;
317 }
318 } else if data.trim().is_empty() {
319 return;
320 }
321 TreeNode::add_child_value(&parent, HtmlNodeType::Text(data));
322 }
323 }
324
325 fn handle_comment(&mut self, token: Token) {
326 if let Token::Comment(data) = token {
327 let parent = Rc::clone(self.stack.last().unwrap());
328 TreeNode::add_child_value(&parent, HtmlNodeType::Comment(data));
329 }
330 }
331
332 fn handle_doctype(&mut self, token: Token) {
333 if let Token::Doctype {
334 name,
335 public_id,
336 system_id,
337 ..
338 } = token
339 {
340 let parent = Rc::clone(self.stack.last().unwrap());
341 TreeNode::add_child_value(
342 &parent,
343 HtmlNodeType::Doctype {
344 name,
345 public_id,
346 system_id,
347 },
348 );
349 }
350 }
351
352 fn check_start_tag_with_invalid_nesting(
353 &self,
354 name: &String,
355 parent: &Rc<RefCell<TreeNode<HtmlNodeType>>>,
356 ) -> bool {
357 if let HtmlNodeType::Element { tag_name, .. } = &parent.borrow().value {
358 if tag_name != "html" && name == "body" {
360 println!("here we can see 「お行儀の悪いコード」");
361 return true;
362 }
363 if tag_name == "p" && name == "p" {
365 return true;
366 }
367 if tag_name == "li" && name == "li" {
369 return true;
370 }
371 if tag_name == "a" && name == "a" {
373 return true;
374 }
375 if tag_name == "dt" && (name == "dt" || name == "dd") {
377 return true;
378 }
379 if tag_name == "dd" && (name == "dt" || name == "dd") {
381 return true;
382 }
383 if tag_name == "option" && name == "option" {
385 return true;
386 }
387 if matches!(
389 tag_name.as_str(),
390 "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
391 ) && html_util::is_block_level_element(name)
392 {
393 return true;
394 }
395 }
396 false
397 }
398
399 fn autofill_elements(&mut self) {
401 let root = Rc::clone(&self.stack[0]);
402 let mut has_doctype = false;
403 let mut has_html = false;
404 let mut has_head = false;
405 let mut has_body = false;
406
407 for child in root.borrow().children() {
408 match &child.borrow().value {
409 HtmlNodeType::Doctype { .. } => has_doctype = true,
410 HtmlNodeType::Element { tag_name, .. } if tag_name.to_lowercase() == "html" => {
411 has_html = true;
412 for html_child in child.borrow().children() {
413 match &html_child.borrow().value {
414 HtmlNodeType::Element { tag_name, .. }
415 if tag_name.to_lowercase() == "head" =>
416 {
417 has_head = true;
418 }
419 HtmlNodeType::Element { tag_name, .. }
420 if tag_name.to_lowercase() == "body" =>
421 {
422 has_body = true;
423 }
424 _ => {}
425 }
426 }
427 }
428 _ => {}
429 }
430 }
431
432 if !has_doctype {
433 let doctype_node = TreeNode::new(HtmlNodeType::Doctype {
434 name: Some("html".to_string()),
435 public_id: None,
436 system_id: None,
437 });
438 TreeNode::insert_child_at(&root, 0, Rc::clone(&doctype_node));
439 }
440
441 if !has_html {
442 let html_node = TreeNode::new(HtmlNodeType::Element {
443 tag_name: "html".to_string(),
444 attributes: vec![],
445 });
446 TreeNode::add_child(&root, Rc::clone(&html_node));
447
448 if !has_head {
449 TreeNode::add_child_value(
450 &html_node,
451 HtmlNodeType::Element {
452 tag_name: "head".to_string(),
453 attributes: vec![],
454 },
455 );
456 }
457
458 if !has_body {
459 TreeNode::add_child_value(
460 &html_node,
461 HtmlNodeType::Element {
462 tag_name: "body".to_string(),
463 attributes: vec![],
464 },
465 );
466 }
467 }
468 }
469}