orinium_browser/engine/html/
parser.rs

1//! HTMLパーサー。トークンストリームをDOMツリーに変換する。
2
3use crate::engine::html::tokenizer::{Attribute, Token, Tokenizer};
4use crate::engine::html::util as html_util;
5use crate::engine::tree::*;
6use std::cell::RefCell;
7use std::rc::Rc;
8
/// Payload stored in each node of the DOM tree built by the HTML parser.
#[derive(Debug, Clone)]
pub enum HtmlNodeType {
    /// Root node of a parsed document.
    Document,
    /// An element together with its tag name and attribute list.
    Element {
        tag_name: String,
        attributes: Vec<Attribute>,
    },
    /// Character data.
    Text(String),
    /// Contents of a comment.
    Comment(String),
    /// A DOCTYPE declaration.
    Doctype {
        name: Option<String>,
        public_id: Option<String>,
        system_id: Option<String>,
    },
    /// A token that could not be placed in the tree, paired with a message
    /// describing the problem.
    InvalidNode(Token, String), // used for invalid tokens
}
25
26impl HtmlNodeType {
27    pub fn tag_name(&self) -> Option<&str> {
28        match self {
29            HtmlNodeType::Element { tag_name, .. } => Some(tag_name),
30            _ => None,
31        }
32    }
33
34    pub fn get_attr(&self, name: &str) -> Option<&str> {
35        match self {
36            HtmlNodeType::Element { attributes, .. } => attributes
37                .iter()
38                .find(|attr| attr.name == name)
39                .map(|attr| attr.value.as_str()),
40            _ => None,
41        }
42    }
43    pub fn set_attr(&mut self, name: &str, value: String) {
44        if let HtmlNodeType::Element { attributes, .. } = self {
45            if let Some(attr) = attributes.iter_mut().find(|attr| attr.name == name) {
46                attr.value = value;
47            } else {
48                attributes.push(Attribute {
49                    name: name.to_string(),
50                    value,
51                });
52            }
53        }
54    }
55    pub fn remove_attr(&mut self, name: &str) -> Option<String> {
56        if let HtmlNodeType::Element { attributes, .. } = self {
57            attributes
58                .iter()
59                .position(|attr| attr.name == name)
60                .map(|pos| attributes.remove(pos).value)
61        } else {
62            None
63        }
64    }
65    pub fn has_attr(&self, name: &str) -> bool {
66        match self {
67            HtmlNodeType::Element { attributes, .. } => {
68                attributes.iter().any(|attr| attr.name == name)
69            }
70            _ => false,
71        }
72    }
73}
74
/// Tree whose nodes carry [`HtmlNodeType`] values.
pub type DomTree = Tree<HtmlNodeType>;
76
77impl DomTree {
78    /// Returns all elements with the given tag name
79    pub fn get_elements_by_tag_name(&self, tag_name: &str) -> Vec<NodeRef<HtmlNodeType>> {
80        self.find_all(|n| {
81            if let HtmlNodeType::Element { tag_name: t, .. } = n {
82                t.eq_ignore_ascii_case(tag_name)
83            } else {
84                false
85            }
86        })
87    }
88
89    /// Returns the element with the given id
90    pub fn get_element_by_id(&self, id: &str) -> Option<NodeRef<HtmlNodeType>> {
91        self.find_all(|n| {
92            if let HtmlNodeType::Element { attributes, .. } = n {
93                attributes
94                    .iter()
95                    .any(|attr| attr.name == "id" && attr.value == id)
96            } else {
97                false
98            }
99        })
100        .into_iter()
101        .next()
102    }
103
104    /// Returns all elements that have the given class
105    pub fn get_elements_by_class_name(&self, class_name: &str) -> Vec<NodeRef<HtmlNodeType>> {
106        self.find_all(|n| {
107            if let HtmlNodeType::Element { attributes, .. } = n {
108                attributes.iter().any(|attr| {
109                    attr.name == "class" && attr.value.split_whitespace().any(|c| c == class_name)
110                })
111            } else {
112                false
113            }
114        })
115    }
116
117    /// Returns the concatenated text content of this node (including children)
118    pub fn inner_text(node: &NodeRef<HtmlNodeType>) -> String {
119        let n = node.borrow();
120        match &n.value {
121            HtmlNodeType::Text(content) => content.clone(),
122            HtmlNodeType::Element { .. } => n.children().iter().map(DomTree::inner_text).collect(),
123            _ => "".to_string(),
124        }
125    }
126
127    /// Replace all text content of this node with the given string
128    pub fn set_text_content(node: &NodeRef<HtmlNodeType>, new_text: &str) {
129        let mut n = node.borrow_mut();
130        match &mut n.value {
131            HtmlNodeType::Text(content) => *content = new_text.to_string(),
132            HtmlNodeType::Element { .. } => {
133                // remove all children and add a single Text node
134                n.clear_children();
135                let text_node = TreeNode::new(HtmlNodeType::Text(new_text.to_string()));
136                TreeNode::add_child(node, text_node);
137            }
138            _ => { /* do nothing */ }
139        }
140    }
141
142    /// 指定したタグ名の要素のテキストノードをすべて集める
143    pub fn collect_text_by_tag(&self, tag_name: &str) -> Vec<String> {
144        let mut texts = Vec::new();
145
146        self.traverse(|node| {
147            let n = node.borrow();
148            if let HtmlNodeType::Element { tag_name: t, .. } = &n.value
149                && t.eq_ignore_ascii_case(tag_name)
150            {
151                let text_of_this_node: String = n
152                    .children()
153                    .iter()
154                    .filter_map(|child| {
155                        let child_ref = child.borrow();
156                        if let HtmlNodeType::Text(content) = &child_ref.value {
157                            Some(content.clone())
158                        } else {
159                            None
160                        }
161                    })
162                    .collect();
163
164                texts.push(text_of_this_node);
165            }
166        });
167
168        texts
169    }
170}
171
/// Builds a [`DomTree`] from the tokens produced by a [`Tokenizer`].
pub struct Parser<'a> {
    /// Token source for the input being parsed.
    tokenizer: Tokenizer<'a>,
    /// Tree under construction; a clone of it is returned by `parse`.
    tree: DomTree,
    /// Currently open nodes. The first entry is the document root and the last
    /// entry is the node new children are attached to.
    stack: Vec<Rc<RefCell<TreeNode<HtmlNodeType>>>>,
    /// Tag names of the open elements, pushed and popped together with the
    /// element entries of `stack`.
    tag_stack: Vec<String>,
    /// Set to the tag name while inside a `script` or `style` element.
    special_text_mode: Option<String>, // used for script/style
}
179
180impl<'a> Parser<'a> {
181    pub fn new(input: &'a str) -> Self {
182        let document = Tree::new(HtmlNodeType::Document);
183
184        Self {
185            tokenizer: Tokenizer::new(input),
186            tree: document.clone(),
187            stack: vec![document.root.clone()],
188            tag_stack: vec![],
189            special_text_mode: None,
190        }
191    }
192
193    pub fn parse(&mut self) -> DomTree {
194        while let Some(token) = self.tokenizer.next_token() {
195            log::debug!(target:"HtmlParser::Token" ,"Processing token: {token:?}");
196            match token {
197                Token::StartTag { .. } => self.handle_start_tag(token),
198                Token::EndTag { .. } => self.handle_end_tag(token),
199                Token::Doctype { .. } => self.handle_doctype(token),
200                Token::Comment(_) => self.handle_comment(token),
201                Token::Text(_) => self.handle_text(token),
202            }
203        }
204        self.autofill_elements();
205
206        self.tree.clone()
207    }
208
209    fn handle_start_tag(&mut self, token: Token) {
210        if let Token::StartTag {
211            name,
212            attributes,
213            self_closing,
214        } = token
215        {
216            let mut parent = Rc::clone(self.stack.last().unwrap());
217            if self.special_text_mode.is_some() {
218                // TODO:
219                // attributes, self_closing
220                TreeNode::add_child_value(&parent, HtmlNodeType::Text(format!("<{}>", name)));
221                return;
222            }
223
224            while self.check_start_tag_with_invalid_nesting(&name, &parent) {
225                if let HtmlNodeType::Element { tag_name, .. } = &parent.borrow().value {
226                    log::info!(target:"HtmlParser::AutoClosing" ,"Auto-closing tag: <{}> to allow <{}> inside it.", tag_name, name);
227                    self.handle_end_tag(Token::EndTag {
228                        name: tag_name.clone(),
229                    });
230                }
231                parent = Rc::clone(self.stack.last().unwrap());
232            }
233
234            let new_node = TreeNode::add_child_value(
235                &parent,
236                HtmlNodeType::Element {
237                    tag_name: name.clone(),
238                    attributes: attributes.clone(),
239                },
240            );
241
242            // script/style は special mode に
243            if name == "script" || name == "style" {
244                self.special_text_mode = Some(name.clone());
245            }
246
247            // Self-closing タグは stack に push しない
248            if !self_closing {
249                self.tag_stack.push(name.clone());
250                self.stack.push(new_node);
251                log::debug!(target:"HtmlParser::Stack" ,"Stack len: {}, +Pushed <{}> to stack.", self.stack.len(), name);
252            }
253        }
254    }
255
256    fn handle_end_tag(&mut self, token: Token) {
257        if let Token::EndTag { ref name } = token {
258            // special mode を解除
259            if self.special_text_mode.as_deref() == Some(name.as_str()) {
260                self.special_text_mode = None;
261            }
262
263            if self.special_text_mode.is_some() {
264                let parent = Rc::clone(self.stack.last().unwrap());
265                TreeNode::add_child_value(&parent, HtmlNodeType::Text(format!("</{}>", name)));
266                return;
267            }
268
269            let name = name.clone();
270            if self.tag_stack.contains(&name) {
271                while let Some(top) = self.stack.pop() {
272                    if let HtmlNodeType::Element { tag_name, .. } = &top.borrow().value {
273                        self.tag_stack.pop();
274                        if tag_name == &name {
275                            log::debug!(target:"HtmlParser::Stack" ,"Stack len: {}, -Popped </{}> from stack.", self.stack.len(), name);
276                            break;
277                        } else {
278                            log::debug!(target:"HtmlParser::Stack" ,"Stack len: {}, Unmatched end tag: </{}>, Find <{}>", self.stack.len(), name, tag_name);
279                        }
280                    }
281                }
282            } else {
283                let parent = Rc::clone(self.stack.last().unwrap());
284                TreeNode::add_child_value(
285                    &parent,
286                    HtmlNodeType::InvalidNode(
287                        token,
288                        format!("No matching start tag for </{}>", name),
289                    ),
290                );
291                log::debug!(target:"HtmlParser::Invalid" ,"Invalid end tag: </{}>", name);
292            }
293        }
294    }
295
296    fn handle_text(&mut self, token: Token) {
297        if let Token::Text(data) = token {
298            let parent = Rc::clone(self.stack.last().unwrap());
299
300            // special mode 中はそのままテキスト追加
301            if self.special_text_mode.is_some() {
302                TreeNode::add_child_value(&parent, HtmlNodeType::Text(data));
303                return;
304            }
305
306            // 親ノードが pre, textarea, script, style でない場合、空白改行を無視する
307            if let Some(parent_node) = parent.borrow().parent() {
308                let parent_node_borrow = parent_node.borrow();
309                if let HtmlNodeType::Element { tag_name, .. } = &parent_node_borrow.value {
310                    if !matches!(tag_name.as_str(), "pre" | "textarea" | "script" | "style")
311                        && data.trim().is_empty()
312                    {
313                        return;
314                    }
315                } else if data.trim().is_empty() {
316                    return;
317                }
318            } else if data.trim().is_empty() {
319                return;
320            }
321            TreeNode::add_child_value(&parent, HtmlNodeType::Text(data));
322        }
323    }
324
325    fn handle_comment(&mut self, token: Token) {
326        if let Token::Comment(data) = token {
327            let parent = Rc::clone(self.stack.last().unwrap());
328            TreeNode::add_child_value(&parent, HtmlNodeType::Comment(data));
329        }
330    }
331
332    fn handle_doctype(&mut self, token: Token) {
333        if let Token::Doctype {
334            name,
335            public_id,
336            system_id,
337            ..
338        } = token
339        {
340            let parent = Rc::clone(self.stack.last().unwrap());
341            TreeNode::add_child_value(
342                &parent,
343                HtmlNodeType::Doctype {
344                    name,
345                    public_id,
346                    system_id,
347                },
348            );
349        }
350    }
351
352    fn check_start_tag_with_invalid_nesting(
353        &self,
354        name: &String,
355        parent: &Rc<RefCell<TreeNode<HtmlNodeType>>>,
356    ) -> bool {
357        if let HtmlNodeType::Element { tag_name, .. } = &parent.borrow().value {
358            // <html> 以外の中に <body> が来た場合、そのタグを閉じる
359            if tag_name != "html" && name == "body" {
360                println!("here we can see 「お行儀の悪いコード」");
361                return true;
362            }
363            // <p> の中に <p> が来た場合、前の <p> を閉じる
364            if tag_name == "p" && name == "p" {
365                return true;
366            }
367            // <li> の中に <li> が来た場合、前の <li> を閉じる
368            if tag_name == "li" && name == "li" {
369                return true;
370            }
371            // <a> の中に <a> が来た場合、前の <a> を閉じる
372            if tag_name == "a" && name == "a" {
373                return true;
374            }
375            // <dt> の中に <dt> または <dd> が来た場合、前の <dt> を閉じる
376            if tag_name == "dt" && (name == "dt" || name == "dd") {
377                return true;
378            }
379            // <dd> の中に <dt> または <dd> が来た場合、前の <dd> を閉じる
380            if tag_name == "dd" && (name == "dt" || name == "dd") {
381                return true;
382            }
383            // <option> の中に <option> が来た場合、前の <option> を閉じる
384            if tag_name == "option" && name == "option" {
385                return true;
386            }
387            // <p> の中にブロック要素が来た場合、前の <p> を閉じる
388            if matches!(
389                tag_name.as_str(),
390                "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
391            ) && html_util::is_block_level_element(name)
392            {
393                return true;
394            }
395        }
396        false
397    }
398
399    /// DOCTYPE宣言、html, head, body 要素が存在しない場合に補完する
400    fn autofill_elements(&mut self) {
401        let root = Rc::clone(&self.stack[0]);
402        let mut has_doctype = false;
403        let mut has_html = false;
404        let mut has_head = false;
405        let mut has_body = false;
406
407        for child in root.borrow().children() {
408            match &child.borrow().value {
409                HtmlNodeType::Doctype { .. } => has_doctype = true,
410                HtmlNodeType::Element { tag_name, .. } if tag_name.to_lowercase() == "html" => {
411                    has_html = true;
412                    for html_child in child.borrow().children() {
413                        match &html_child.borrow().value {
414                            HtmlNodeType::Element { tag_name, .. }
415                                if tag_name.to_lowercase() == "head" =>
416                            {
417                                has_head = true;
418                            }
419                            HtmlNodeType::Element { tag_name, .. }
420                                if tag_name.to_lowercase() == "body" =>
421                            {
422                                has_body = true;
423                            }
424                            _ => {}
425                        }
426                    }
427                }
428                _ => {}
429            }
430        }
431
432        if !has_doctype {
433            let doctype_node = TreeNode::new(HtmlNodeType::Doctype {
434                name: Some("html".to_string()),
435                public_id: None,
436                system_id: None,
437            });
438            TreeNode::insert_child_at(&root, 0, Rc::clone(&doctype_node));
439        }
440
441        if !has_html {
442            let html_node = TreeNode::new(HtmlNodeType::Element {
443                tag_name: "html".to_string(),
444                attributes: vec![],
445            });
446            TreeNode::add_child(&root, Rc::clone(&html_node));
447
448            if !has_head {
449                TreeNode::add_child_value(
450                    &html_node,
451                    HtmlNodeType::Element {
452                        tag_name: "head".to_string(),
453                        attributes: vec![],
454                    },
455                );
456            }
457
458            if !has_body {
459                TreeNode::add_child_value(
460                    &html_node,
461                    HtmlNodeType::Element {
462                        tag_name: "body".to_string(),
463                        attributes: vec![],
464                    },
465                );
466            }
467        }
468    }
469}