// orinium_browser/engine/html/tokenizer.rs

1//! HTML tokenizer. Converts raw HTML input into token stream.
2
3use super::util::decode_entity;
4
/// Represents a single HTML attribute
///
/// Both fields hold text exactly as accumulated by the tokenizer:
/// case is preserved and no entity decoding is applied to the value.
#[derive(Debug, Clone, PartialEq)]
pub struct Attribute {
    // Attribute name as written in the tag.
    pub name: String,
    // Attribute value; empty when the attribute has no `=value` part.
    pub value: String,
}
11
/// HTML tokens emitted by the tokenizer
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `<!DOCTYPE ...>` declaration.
    Doctype {
        name: Option<String>,
        public_id: Option<String>,
        system_id: Option<String>,
        force_quirks: bool,
    },
    /// Opening tag such as `<div id="x">`; `self_closing` is set for `<br/>`.
    StartTag {
        name: String,
        attributes: Vec<Attribute>,
        self_closing: bool,
    },
    /// Closing tag such as `</div>`.
    EndTag {
        name: String,
    },
    /// Contents of `<!-- ... -->` (without the delimiters).
    Comment(String),
    /// A run of character data between tags (entities already decoded).
    Text(String),
}
32
/// Represents the internal state of the tokenizer
///
/// Loosely modeled on the WHATWG tokenizer states (simplified subset).
#[derive(Debug, PartialEq)]
pub enum TokenizerState {
    /// Ordinary character data.
    Data,
    /// Raw text inside `<script>` (no entity decoding).
    ScriptData,
    /// Raw text inside `<style>` (no entity decoding).
    StyleData,
    /// Collecting a `&...;` character entity.
    EscapeDecoding,
    // --- tag states ---
    TagOpen,
    EndTagOpen,
    TagName,
    // --- attribute states ---
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValueDoubleQuoted,
    AttributeValueSingleQuoted,
    AttributeValueUnquoted,
    SelfClosingStartTag,
    // --- comment states ---
    CommentStartDash,
    Comment,
    CommentEndDash,
    CommentEnd,
    BogusComment,
    // --- doctype states ---
    Doctype,
    DoctypeName,
    BeforeDoctypePublicId,
    DoctypePublicIdWithSingleQuote,
    DoctypePublicIdWithDoubleQuote,
    AfterDoctypePublicId,
    DoctypeSystemId,
    BogusDoctype,
}
65
66impl TokenizerState {
67    /// Returns true if the current state is a doctype-related state
68    fn is_doctype(&self) -> bool {
69        matches!(
70            self,
71            TokenizerState::Doctype
72                | TokenizerState::DoctypeName
73                | TokenizerState::BeforeDoctypePublicId
74                | TokenizerState::DoctypePublicIdWithSingleQuote
75                | TokenizerState::DoctypePublicIdWithDoubleQuote
76                | TokenizerState::AfterDoctypePublicId
77                | TokenizerState::DoctypeSystemId
78                | TokenizerState::BogusDoctype
79        )
80    }
81
82    /// Returns true if the current state is a comment-related state
83    fn is_comment(&self) -> bool {
84        matches!(
85            self,
86            TokenizerState::Comment
87                | TokenizerState::CommentStartDash
88                | TokenizerState::CommentEndDash
89                | TokenizerState::CommentEnd
90                | TokenizerState::BogusComment
91        )
92    }
93}
94
/// HTML tokenizer implementation
pub struct Tokenizer<'a> {
    // Full input document.
    input: &'a str,
    // Byte offset of the next unread character in `input`.
    pos: usize,
    // Slot holding a token that is ready to be returned by `next_token`.
    token: Option<Token>,
    // Current state-machine state.
    state: TokenizerState,
    // Token under construction (moved into `token` by `commit_token`).
    current_token: Option<Token>,
    // Attribute under construction (flushed by `push_current_attribute`).
    current_attribute: Option<Attribute>,
    // Scratch text; used in particular to collect a pending `&entity;`.
    buffer: String,
}
105
106impl<'a> Tokenizer<'a> {
107    /// Creates a new tokenizer for the given input
108    pub fn new(input: &'a str) -> Self {
109        Self {
110            input,
111            pos: 0,
112            token: None,
113            state: TokenizerState::Data,
114            current_token: None,
115            current_attribute: None,
116            buffer: String::new(),
117        }
118    }
119
120    /// Returns the next character from input and advances the position
121    fn next_char(&mut self) -> Option<char> {
122        if self.pos >= self.input.len() {
123            None
124        } else {
125            let c = self.input[self.pos..].chars().next().unwrap();
126            self.pos += c.len_utf8();
127            Some(c)
128        }
129    }
130
131    /// Emits the current token and clears the buffer
132    fn commit_token(&mut self) {
133        self.token = self.current_token.take();
134        self.buffer.clear();
135    }
136
137    /// Pushes the current attribute to the start tag if exists
138    fn push_current_attribute(&mut self) {
139        if let (Some(attr), Some(Token::StartTag { attributes, .. })) =
140            (self.current_attribute.take(), &mut self.current_token)
141        {
142            attributes.push(attr);
143        }
144    }
145
146    fn handle_special_tag_state_transition(&mut self, token: &Token) {
147        if let Token::StartTag { name, .. } = token {
148            // Switch to ScriptData or StyleData state if we encounter <script> or <style> start tags
149            match name.to_lowercase().as_str() {
150                "script" => self.state = TokenizerState::ScriptData,
151                "style" => self.state = TokenizerState::StyleData,
152                _ => self.state = TokenizerState::Data,
153            }
154        } else {
155            // Do nothing. We only switch to ScriptData or StyleData on StartTag, and we return to Data on EndTag.
156        }
157    }
158
159    /// Debug log for emitted tokens
160    #[inline(always)]
161    fn debug_emit(&self, token: &Token) {
162        #[cfg(debug_assertions)]
163        match token {
164            Token::StartTag { name, .. } => {
165                log::debug!(target:"HtmlTokenizer::EmitToken::TagStart", "Emitting token: {name}, Pos: {}", self.pos)
166            }
167            Token::EndTag { name } => {
168                log::debug!(target:"HtmlTokenizer::EmitToken::TagEnd", "Emitting token: {name}, Pos: {}", self.pos)
169            }
170            Token::Comment(comment) => {
171                log::debug!(target:"HtmlTokenizer::EmitToken::Comment", "Emitting token: {}, Pos: {}", comment, self.pos)
172            }
173            Token::Text(text) => {
174                log::debug!(target:"HtmlTokenizer::EmitToken::Text", "Emitting token: `{text}`, Pos: {}", self.pos)
175            }
176            _ => {}
177        }
178    }
179
    /// Returns the next token if available
    ///
    /// Drives the state machine one character at a time until some handler
    /// commits a token, then returns it. Returns `None` once the input is
    /// fully consumed and nothing is left pending.
    pub fn next_token(&mut self) -> Option<Token> {
        while let Some(c) = self.next_char() {
            log::debug!(target:"HtmlTokenizer::Char", "State: {:?}, Char: '{}'", self.state, c);

            // Dispatch to the handler for the current state. The guard arms
            // (`is_doctype`, `is_comment`) fold several related states into
            // a single handler.
            match self.state {
                TokenizerState::Data | TokenizerState::StyleData | TokenizerState::ScriptData => {
                    self.state_data(c)
                }
                TokenizerState::EscapeDecoding => self.state_escape_decoding(c),
                _ if self.state.is_doctype() => self.state_doctype(c),
                TokenizerState::TagOpen => self.state_tag_open(c),
                TokenizerState::TagName => self.state_tag_name(c),
                TokenizerState::BeforeAttributeName => self.state_before_attribute_name(c),
                TokenizerState::AttributeName => self.state_attribute_name(c),
                TokenizerState::BeforeAttributeValue => self.state_before_attribute_value(c),
                TokenizerState::AttributeValueDoubleQuoted
                | TokenizerState::AttributeValueSingleQuoted => {
                    self.state_attribute_value_quoted(c)
                }
                TokenizerState::AfterAttributeName => self.state_after_attribute_name(c),
                TokenizerState::AttributeValueUnquoted => self.state_attribute_value_unquoted(c),
                TokenizerState::SelfClosingStartTag => self.state_self_closing_start_tag(c),
                TokenizerState::EndTagOpen => self.state_end_tag_open(c),
                _ if self.state.is_comment() => self.state_comment(c),
                _ => {
                    log::error!(target:"HtmlTokenizer::State", "Unimplemented state: {:?}, returning to Data state", self.state);
                    self.state = TokenizerState::Data;
                }
            }

            // A handler may have committed a finished token into `self.token`.
            if let Some(token) = self.token.take() {
                self.debug_emit(&token);
                // `<script>`/`<style>` start tags switch to raw-text states.
                self.handle_special_tag_state_transition(&token);
                return Some(token);
            }
        }

        // End of input: commit remaining current_token if exists
        if self.current_token.is_some() {
            self.commit_token();
            return self.token.take();
        }

        // Emit BogusComment if input ended while in comment
        if self.state.is_comment() {
            self.state = TokenizerState::BogusComment;
            self.commit_token();
            return self.token.take();
        }

        None
    }
233
234    // --- State handlers ---
235    fn state_data(&mut self, c: char) {
236        match c {
237            '<' => {
238                self.commit_token();
239                self.state = TokenizerState::TagOpen;
240            }
241            // Handle escape entities only in Data. Not in ScriptData, and StyleData states.
242            '&' if self.state == TokenizerState::Data => {
243                self.buffer.push('&');
244                self.state = TokenizerState::EscapeDecoding;
245            }
246            _ => {
247                self.buffer.push(c);
248                match &mut self.current_token {
249                    Some(Token::Text(text)) => text.push(c),
250                    _ => self.current_token = Some(Token::Text(c.to_string())),
251                }
252            }
253        }
254    }
255
    /// EscapeDecoding: collecting a candidate `&name;` entity in `buffer`.
    fn state_escape_decoding(&mut self, c: char) {
        if c == ';' {
            // Everything after the last '&' in the buffer is the entity name.
            let mut iter = self.buffer.rsplitn(2, '&');
            let entity = iter.next().unwrap_or("");

            // Unknown entities are emitted verbatim as `&name;`.
            let decoded = decode_entity(entity).unwrap_or_else(|| format!("&{};", entity));

            match &mut self.current_token {
                Some(Token::Text(text)) => text.push_str(&decoded),
                _ => self.current_token = Some(Token::Text(decoded)),
            }

            self.buffer.clear();
            self.state = TokenizerState::Data;
        } else {
            // NOTE(review): an entity that is never terminated by ';' keeps
            // accumulating here (including any '<') until the next ';' or
            // end of input — confirm this is acceptable for text like "a & b".
            self.buffer.push(c);
        }
    }
274
275    fn state_tag_open(&mut self, c: char) {
276        match c {
277            '/' => self.state = TokenizerState::EndTagOpen,
278            '!' => {
279                if self.input[self.pos..].starts_with('-') {
280                    self.pos += 1;
281                    self.state = TokenizerState::CommentStartDash;
282                } else if self.input[self.pos..].to_lowercase().starts_with("doctype") {
283                    self.pos += 7;
284                    self.state = TokenizerState::Doctype;
285                    self.current_token = Some(Token::Doctype {
286                        name: None,
287                        public_id: None,
288                        system_id: None,
289                        force_quirks: false,
290                    });
291                } else {
292                    self.state = TokenizerState::BogusComment;
293                }
294            }
295            c if c.is_ascii_alphabetic() => {
296                self.state = TokenizerState::TagName;
297                self.buffer.push(c);
298                self.current_token = Some(Token::StartTag {
299                    name: c.to_string(),
300                    attributes: Vec::new(),
301                    self_closing: false,
302                });
303            }
304            _ => {
305                self.buffer.push('<');
306                self.buffer.push(c);
307                match &mut self.current_token {
308                    Some(Token::Text(text)) => {
309                        text.push('<');
310                        text.push(c);
311                    }
312                    _ => self.current_token = Some(Token::Text(format!("<{c}"))),
313                }
314                self.state = TokenizerState::Data;
315            }
316        }
317    }
318
319    fn state_tag_name(&mut self, c: char) {
320        match c {
321            c if c.is_whitespace() => self.state = TokenizerState::BeforeAttributeName,
322            '/' => self.state = TokenizerState::SelfClosingStartTag,
323            '>' => {
324                self.commit_token();
325                self.state = TokenizerState::Data;
326            }
327            c if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' => {
328                self.buffer.push(c);
329                match &mut self.current_token {
330                    Some(Token::StartTag { name, .. }) => name.push(c),
331                    Some(Token::EndTag { name }) => name.push(c),
332                    _ => {}
333                }
334            }
335            _ => {
336                self.commit_token();
337                self.state = TokenizerState::Data;
338            }
339        }
340    }
341
342    fn state_before_attribute_name(&mut self, c: char) {
343        match c {
344            c if c.is_whitespace() => {}
345            '/' => self.state = TokenizerState::SelfClosingStartTag,
346            '>' => {
347                self.commit_token();
348                self.state = TokenizerState::Data;
349            }
350            c if c.is_ascii_alphanumeric() => {
351                self.state = TokenizerState::AttributeName;
352                self.buffer.push(c);
353                self.current_attribute = Some(Attribute {
354                    name: c.to_string(),
355                    value: String::new(),
356                });
357            }
358            _ => {}
359        }
360    }
361
362    fn state_attribute_name(&mut self, c: char) {
363        match c {
364            c if c.is_whitespace() => self.state = TokenizerState::AfterAttributeName,
365            '=' => self.state = TokenizerState::BeforeAttributeValue,
366            '/' => {
367                self.push_current_attribute();
368                self.state = TokenizerState::SelfClosingStartTag;
369            }
370            '>' => {
371                self.push_current_attribute();
372                self.commit_token();
373                self.state = TokenizerState::Data;
374            }
375            c if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' => {
376                self.buffer.push(c);
377                if let Some(attr) = &mut self.current_attribute {
378                    attr.name.push(c);
379                }
380            }
381            _ => {}
382        }
383    }
384
385    fn state_before_attribute_value(&mut self, c: char) {
386        match c {
387            c if c.is_whitespace() => {}
388            '"' => self.state = TokenizerState::AttributeValueDoubleQuoted,
389            '\'' => self.state = TokenizerState::AttributeValueSingleQuoted,
390            '>' => {
391                self.push_current_attribute();
392                self.commit_token();
393                self.state = TokenizerState::Data;
394            }
395            _ => {
396                self.state = TokenizerState::AttributeValueUnquoted;
397                if let Some(attr) = &mut self.current_attribute {
398                    attr.value.push(c);
399                }
400            }
401        }
402    }
403
404    fn state_attribute_value_quoted(&mut self, c: char) {
405        match (&self.state, c) {
406            (&TokenizerState::AttributeValueDoubleQuoted, '"')
407            | (&TokenizerState::AttributeValueSingleQuoted, '\'') => {
408                self.push_current_attribute();
409                self.state = TokenizerState::AfterAttributeName;
410            }
411            _ => {
412                if let Some(attr) = &mut self.current_attribute {
413                    attr.value.push(c);
414                }
415            }
416        }
417    }
418
419    fn state_after_attribute_name(&mut self, c: char) {
420        match c {
421            c if c.is_whitespace() => {}
422            '/' => self.state = TokenizerState::SelfClosingStartTag,
423            '>' => {
424                self.commit_token();
425                self.state = TokenizerState::Data;
426            }
427            c if c.is_ascii_alphanumeric() => {
428                self.state = TokenizerState::AttributeName;
429                self.buffer.push(c);
430                self.current_attribute = Some(Attribute {
431                    name: c.to_string(),
432                    value: String::new(),
433                });
434            }
435            _ => {}
436        }
437    }
438
439    fn state_attribute_value_unquoted(&mut self, c: char) {
440        match c {
441            c if c.is_whitespace() => {
442                self.push_current_attribute();
443                self.state = TokenizerState::BeforeAttributeName;
444            }
445            '>' => {
446                self.push_current_attribute();
447                self.commit_token();
448                self.state = TokenizerState::Data;
449            }
450            _ => {
451                if let Some(attr) = &mut self.current_attribute {
452                    attr.value.push(c);
453                }
454            }
455        }
456    }
457
458    fn state_self_closing_start_tag(&mut self, c: char) {
459        match c {
460            '>' => {
461                if let Some(Token::StartTag { self_closing, .. }) = &mut self.current_token {
462                    *self_closing = true;
463                }
464                self.commit_token();
465                self.state = TokenizerState::Data;
466            }
467            _ => self.state = TokenizerState::Data,
468        }
469    }
470
471    fn state_end_tag_open(&mut self, c: char) {
472        match c {
473            c if c.is_ascii_alphabetic() => {
474                self.state = TokenizerState::TagName;
475                self.buffer.push(c);
476                self.current_token = Some(Token::EndTag {
477                    name: c.to_string(),
478                });
479            }
480            _ => self.state = TokenizerState::Data,
481        }
482    }
483
484    fn state_comment(&mut self, c: char) {
485        match self.state {
486            TokenizerState::CommentStartDash => {
487                if c == '-' {
488                    self.state = TokenizerState::Comment;
489                    self.current_token = Some(Token::Comment(String::new()));
490                } else {
491                    self.state = TokenizerState::BogusComment;
492                }
493            }
494            TokenizerState::Comment => {
495                if c == '-' {
496                    self.state = TokenizerState::CommentEndDash;
497                } else if let Some(Token::Comment(comment)) = &mut self.current_token {
498                    comment.push(c);
499                }
500            }
501            TokenizerState::CommentEndDash => {
502                if c == '-' {
503                    self.state = TokenizerState::CommentEnd;
504                } else {
505                    self.state = TokenizerState::Comment;
506                    if let Some(Token::Comment(comment)) = &mut self.current_token {
507                        comment.push('-');
508                        comment.push(c);
509                    }
510                }
511            }
512            TokenizerState::CommentEnd => {
513                if c == '>' {
514                    self.commit_token();
515                    self.state = TokenizerState::Data;
516                } else {
517                    self.state = TokenizerState::Comment;
518                    if let Some(Token::Comment(comment)) = &mut self.current_token {
519                        comment.push_str("--");
520                        comment.push(c);
521                    }
522                }
523            }
524            _ => {}
525        }
526    }
527
528    fn state_doctype(&mut self, c: char) {
529        match c {
530            c if c.is_whitespace() => match self.state {
531                TokenizerState::Doctype => self.state = TokenizerState::DoctypeName,
532                TokenizerState::DoctypeName => {
533                    if self.input[self.pos..].to_lowercase().starts_with("public")
534                        || self.input[self.pos..].to_lowercase().starts_with("system")
535                    {
536                        self.pos += 6;
537                        self.state = TokenizerState::BeforeDoctypePublicId;
538                    }
539                }
540                TokenizerState::AfterDoctypePublicId => {
541                    self.state = TokenizerState::DoctypeSystemId;
542                }
543                _ => {}
544            },
545            '>' => {
546                if let Some(Token::Doctype { force_quirks, .. }) = &mut self.current_token
547                    && self.state == TokenizerState::BogusDoctype
548                {
549                    *force_quirks = true;
550                }
551                self.commit_token();
552                self.state = TokenizerState::Data;
553            }
554            _ => {
555                self.buffer.push(c);
556                match self.state {
557                    TokenizerState::Doctype => self.state = TokenizerState::BogusDoctype,
558                    TokenizerState::DoctypeName => {
559                        if let Some(Token::Doctype { name, .. }) = &mut self.current_token {
560                            if name.is_none() {
561                                *name = Some(c.to_string());
562                            } else if let Some(n) = name {
563                                n.push(c);
564                            }
565                        }
566                    }
567                    TokenizerState::BeforeDoctypePublicId => {
568                        match c {
569                            '"' => self.state = TokenizerState::DoctypePublicIdWithDoubleQuote,
570                            '\'' => self.state = TokenizerState::DoctypePublicIdWithSingleQuote,
571                            _ if c.is_whitespace() => {}
572                            _ => self.state = TokenizerState::BogusDoctype,
573                        }
574                        if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token {
575                            *public_id = Some(c.to_string());
576                        }
577                    }
578                    TokenizerState::DoctypePublicIdWithSingleQuote
579                    | TokenizerState::DoctypePublicIdWithDoubleQuote => {
580                        if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token
581                            && let Some(pid) = public_id
582                        {
583                            pid.push(c);
584                        }
585                        if (self.state == TokenizerState::DoctypePublicIdWithSingleQuote
586                            && c == '\'')
587                            || (self.state == TokenizerState::DoctypePublicIdWithDoubleQuote
588                                && c == '"')
589                        {
590                            self.state = TokenizerState::AfterDoctypePublicId;
591                        }
592                    }
593                    TokenizerState::DoctypeSystemId => {
594                        if let Some(Token::Doctype { system_id, .. }) = &mut self.current_token {
595                            if system_id.is_none() {
596                                *system_id = Some(c.to_string());
597                            } else if let Some(sid) = system_id {
598                                sid.push(c);
599                            }
600                        }
601                    }
602                    _ => {}
603                }
604            }
605        }
606    }
607}
608
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer to exhaustion and collects every emitted token.
    fn collect_tokens(input: &str) -> Vec<Token> {
        let mut tokenizer = Tokenizer::new(input);
        let mut tokens = Vec::new();
        while let Some(token) = tokenizer.next_token() {
            tokens.push(token);
        }
        tokens
    }

    // Bare text with no markup becomes a single Text token.
    #[test]
    fn test_text_node() {
        let input = "Hello, world!";
        let tokens = collect_tokens(input);
        assert_eq!(tokens, vec![Token::Text("Hello, world!".to_string())]);
    }

    // A matched start/end tag pair with no content.
    #[test]
    fn test_simple_tag() {
        let input = "<div></div>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false
                },
                Token::EndTag {
                    name: "div".to_string()
                }
            ]
        );
    }

    // Both double- and single-quoted attribute values, in order.
    #[test]
    fn test_tag_with_attributes() {
        let input = r#"<a href="https://example.com" target='_blank'>Link</a>"#;
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "a".to_string(),
                    attributes: vec![
                        Attribute {
                            name: "href".to_string(),
                            value: "https://example.com".to_string()
                        },
                        Attribute {
                            name: "target".to_string(),
                            value: "_blank".to_string()
                        },
                    ],
                    self_closing: false
                },
                Token::Text("Link".to_string()),
                Token::EndTag {
                    name: "a".to_string()
                }
            ]
        );
    }

    // `/>` sets the self_closing flag on the start tag.
    #[test]
    fn test_self_closing_tag() {
        let input = "<img src='image.png'/>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![Attribute {
                    name: "src".to_string(),
                    value: "image.png".to_string()
                }],
                self_closing: true
            }]
        );
    }

    // Comment contents exclude the `<!--`/`-->` delimiters.
    #[test]
    fn test_comment() {
        let input = "<!-- This is a comment -->";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![Token::Comment(" This is a comment ".to_string())]
        );
    }

    // Minimal HTML5 doctype: only the name is populated.
    #[test]
    fn test_doctype() {
        let input = "<!DOCTYPE html>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false
            }]
        );
    }

    // `&amp;` is decoded inside ordinary text.
    #[test]
    fn test_escape_entity() {
        let input = "Hello &amp; goodbye";
        let tokens = collect_tokens(input);
        assert_eq!(tokens, vec![Token::Text("Hello & goodbye".to_string())]);
    }

    // Tokens come out in document order; nesting is the parser's job.
    #[test]
    fn test_nested_tags() {
        let input = "<div><span>Text</span></div>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false
                },
                Token::StartTag {
                    name: "span".to_string(),
                    attributes: vec![],
                    self_closing: false
                },
                Token::Text("Text".to_string()),
                Token::EndTag {
                    name: "span".to_string()
                },
                Token::EndTag {
                    name: "div".to_string()
                },
            ]
        );
    }
}