1use super::util::decode_entity;
4
/// A single `name="value"` pair parsed from a start tag.
#[derive(Debug, Clone, PartialEq)]
pub struct Attribute {
    // Attribute name as written in the source (case preserved).
    pub name: String,
    // Attribute value with surrounding quotes stripped; empty until a value is seen.
    pub value: String,
}
11
/// Tokens produced by the tokenizer, one per call to `Tokenizer::next_token`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `<!DOCTYPE ...>` declaration.
    Doctype {
        name: Option<String>,
        public_id: Option<String>,
        system_id: Option<String>,
        // Set when the declaration is malformed (bogus doctype) so the
        // tree builder can switch to quirks mode.
        force_quirks: bool,
    },
    /// `<name attr="value" ...>`; `self_closing` is set for `<name ... />`.
    StartTag {
        name: String,
        attributes: Vec<Attribute>,
        self_closing: bool,
    },
    /// `</name>`.
    EndTag {
        name: String,
    },
    /// `<!-- ... -->` contents, without the delimiters.
    Comment(String),
    /// A run of character data between tags.
    Text(String),
}
32
/// States of the tokenizer's character-at-a-time state machine.
/// Loosely modeled on the HTML tokenization states; not a full spec implementation.
#[derive(Debug, PartialEq)]
pub enum TokenizerState {
    // Character data (ScriptData/StyleData suppress entity decoding).
    Data,
    ScriptData,
    StyleData,
    // Inside a `&...;` character reference.
    EscapeDecoding,
    // Tag parsing.
    TagOpen,
    EndTagOpen,
    TagName,
    // Attribute parsing.
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValueDoubleQuoted,
    AttributeValueSingleQuoted,
    AttributeValueUnquoted,
    SelfClosingStartTag,
    // Comment parsing.
    CommentStartDash,
    Comment,
    CommentEndDash,
    CommentEnd,
    BogusComment,
    // Doctype parsing.
    Doctype,
    DoctypeName,
    BeforeDoctypePublicId,
    DoctypePublicIdWithSingleQuote,
    DoctypePublicIdWithDoubleQuote,
    AfterDoctypePublicId,
    DoctypeSystemId,
    BogusDoctype,
}
65
66impl TokenizerState {
67 fn is_doctype(&self) -> bool {
69 matches!(
70 self,
71 TokenizerState::Doctype
72 | TokenizerState::DoctypeName
73 | TokenizerState::BeforeDoctypePublicId
74 | TokenizerState::DoctypePublicIdWithSingleQuote
75 | TokenizerState::DoctypePublicIdWithDoubleQuote
76 | TokenizerState::AfterDoctypePublicId
77 | TokenizerState::DoctypeSystemId
78 | TokenizerState::BogusDoctype
79 )
80 }
81
82 fn is_comment(&self) -> bool {
84 matches!(
85 self,
86 TokenizerState::Comment
87 | TokenizerState::CommentStartDash
88 | TokenizerState::CommentEndDash
89 | TokenizerState::CommentEnd
90 | TokenizerState::BogusComment
91 )
92 }
93}
94
/// Streaming HTML tokenizer over a borrowed input string.
pub struct Tokenizer<'a> {
    // Full input; consumed via the byte offset `pos`, never mutated.
    input: &'a str,
    // Byte offset of the next character to read (always on a char boundary).
    pos: usize,
    // Token finished during the current step; picked up and returned by `next_token`.
    token: Option<Token>,
    // Current state-machine state.
    state: TokenizerState,
    // Token under construction, not yet emitted.
    current_token: Option<Token>,
    // Attribute under construction for the current start tag.
    current_attribute: Option<Attribute>,
    // Scratch text accumulated since the last emitted token; also holds the
    // pending entity name during `EscapeDecoding`.
    buffer: String,
}
105
106impl<'a> Tokenizer<'a> {
    /// Create a tokenizer over `input`, starting in the `Data` state.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            pos: 0,
            token: None,
            state: TokenizerState::Data,
            current_token: None,
            current_attribute: None,
            buffer: String::new(),
        }
    }
119
120 fn next_char(&mut self) -> Option<char> {
122 if self.pos >= self.input.len() {
123 None
124 } else {
125 let c = self.input[self.pos..].chars().next().unwrap();
126 self.pos += c.len_utf8();
127 Some(c)
128 }
129 }
130
    /// Move the in-progress token into the emit slot and reset the scratch buffer.
    fn commit_token(&mut self) {
        self.token = self.current_token.take();
        self.buffer.clear();
    }
136
137 fn push_current_attribute(&mut self) {
139 if let (Some(attr), Some(Token::StartTag { attributes, .. })) =
140 (self.current_attribute.take(), &mut self.current_token)
141 {
142 attributes.push(attr);
143 }
144 }
145
146 fn handle_special_tag_state_transition(&mut self, token: &Token) {
147 if let Token::StartTag { name, .. } = token {
148 match name.to_lowercase().as_str() {
150 "script" => self.state = TokenizerState::ScriptData,
151 "style" => self.state = TokenizerState::StyleData,
152 _ => self.state = TokenizerState::Data,
153 }
154 } else {
155 }
157 }
158
159 #[inline(always)]
161 fn debug_emit(&self, token: &Token) {
162 #[cfg(debug_assertions)]
163 match token {
164 Token::StartTag { name, .. } => {
165 log::debug!(target:"HtmlTokenizer::EmitToken::TagStart", "Emitting token: {name}, Pos: {}", self.pos)
166 }
167 Token::EndTag { name } => {
168 log::debug!(target:"HtmlTokenizer::EmitToken::TagEnd", "Emitting token: {name}, Pos: {}", self.pos)
169 }
170 Token::Comment(comment) => {
171 log::debug!(target:"HtmlTokenizer::EmitToken::Comment", "Emitting token: {}, Pos: {}", comment, self.pos)
172 }
173 Token::Text(text) => {
174 log::debug!(target:"HtmlTokenizer::EmitToken::Text", "Emitting token: `{text}`, Pos: {}", self.pos)
175 }
176 _ => {}
177 }
178 }
179
    /// Pump the state machine one character at a time until a token is
    /// completed, then emit it. Returns `None` once the input is exhausted
    /// and nothing remains to flush.
    pub fn next_token(&mut self) -> Option<Token> {
        while let Some(c) = self.next_char() {
            log::debug!(target:"HtmlTokenizer::Char", "State: {:?}, Char: '{}'", self.state, c);

            match self.state {
                // Script/style bodies reuse the data handler; entity decoding
                // is suppressed for them inside `state_data` itself.
                TokenizerState::Data | TokenizerState::StyleData | TokenizerState::ScriptData => {
                    self.state_data(c)
                }
                TokenizerState::EscapeDecoding => self.state_escape_decoding(c),
                // All doctype sub-states funnel into one handler.
                _ if self.state.is_doctype() => self.state_doctype(c),
                TokenizerState::TagOpen => self.state_tag_open(c),
                TokenizerState::TagName => self.state_tag_name(c),
                TokenizerState::BeforeAttributeName => self.state_before_attribute_name(c),
                TokenizerState::AttributeName => self.state_attribute_name(c),
                TokenizerState::BeforeAttributeValue => self.state_before_attribute_value(c),
                TokenizerState::AttributeValueDoubleQuoted
                | TokenizerState::AttributeValueSingleQuoted => {
                    self.state_attribute_value_quoted(c)
                }
                TokenizerState::AfterAttributeName => self.state_after_attribute_name(c),
                TokenizerState::AttributeValueUnquoted => self.state_attribute_value_unquoted(c),
                TokenizerState::SelfClosingStartTag => self.state_self_closing_start_tag(c),
                TokenizerState::EndTagOpen => self.state_end_tag_open(c),
                // All comment sub-states funnel into one handler.
                _ if self.state.is_comment() => self.state_comment(c),
                _ => {
                    log::error!(target:"HtmlTokenizer::State", "Unimplemented state: {:?}, returning to Data state", self.state);
                    self.state = TokenizerState::Data;
                }
            }

            // A handler that finished a token parked it in `self.token`.
            if let Some(token) = self.token.take() {
                self.debug_emit(&token);
                self.handle_special_tag_state_transition(&token);
                return Some(token);
            }
        }

        // End of input: flush whatever token was still being built.
        if self.current_token.is_some() {
            self.commit_token();
            return self.token.take();
        }

        // Unterminated comment at EOF.
        // NOTE(review): `current_token` is None on this path (the branch
        // above would have taken it otherwise), so this returns None —
        // confirm an unterminated "<!--" is meant to be dropped.
        if self.state.is_comment() {
            self.state = TokenizerState::BogusComment;
            self.commit_token();
            return self.token.take();
        }

        None
    }
233
    /// `Data` / `ScriptData` / `StyleData`: accumulate character data.
    ///
    /// NOTE(review): '<' opens a tag even in ScriptData/StyleData, so a bare
    /// `<` inside script text (e.g. `a < b`) is treated as markup — confirm
    /// this is acceptable for the inputs this parser targets.
    fn state_data(&mut self, c: char) {
        match c {
            '<' => {
                // Flush any pending text token before starting a tag.
                self.commit_token();
                self.state = TokenizerState::TagOpen;
            }
            // Entities are only decoded in plain data, not in script/style.
            '&' if self.state == TokenizerState::Data => {
                self.buffer.push('&');
                self.state = TokenizerState::EscapeDecoding;
            }
            _ => {
                self.buffer.push(c);
                match &mut self.current_token {
                    Some(Token::Text(text)) => text.push(c),
                    _ => self.current_token = Some(Token::Text(c.to_string())),
                }
            }
        }
    }
255
256 fn state_escape_decoding(&mut self, c: char) {
257 if c == ';' {
258 let mut iter = self.buffer.rsplitn(2, '&');
259 let entity = iter.next().unwrap_or("");
260
261 let decoded = decode_entity(entity).unwrap_or_else(|| format!("&{};", entity));
262
263 match &mut self.current_token {
264 Some(Token::Text(text)) => text.push_str(&decoded),
265 _ => self.current_token = Some(Token::Text(decoded)),
266 }
267
268 self.buffer.clear();
269 self.state = TokenizerState::Data;
270 } else {
271 self.buffer.push(c);
272 }
273 }
274
275 fn state_tag_open(&mut self, c: char) {
276 match c {
277 '/' => self.state = TokenizerState::EndTagOpen,
278 '!' => {
279 if self.input[self.pos..].starts_with('-') {
280 self.pos += 1;
281 self.state = TokenizerState::CommentStartDash;
282 } else if self.input[self.pos..].to_lowercase().starts_with("doctype") {
283 self.pos += 7;
284 self.state = TokenizerState::Doctype;
285 self.current_token = Some(Token::Doctype {
286 name: None,
287 public_id: None,
288 system_id: None,
289 force_quirks: false,
290 });
291 } else {
292 self.state = TokenizerState::BogusComment;
293 }
294 }
295 c if c.is_ascii_alphabetic() => {
296 self.state = TokenizerState::TagName;
297 self.buffer.push(c);
298 self.current_token = Some(Token::StartTag {
299 name: c.to_string(),
300 attributes: Vec::new(),
301 self_closing: false,
302 });
303 }
304 _ => {
305 self.buffer.push('<');
306 self.buffer.push(c);
307 match &mut self.current_token {
308 Some(Token::Text(text)) => {
309 text.push('<');
310 text.push(c);
311 }
312 _ => self.current_token = Some(Token::Text(format!("<{c}"))),
313 }
314 self.state = TokenizerState::Data;
315 }
316 }
317 }
318
319 fn state_tag_name(&mut self, c: char) {
320 match c {
321 c if c.is_whitespace() => self.state = TokenizerState::BeforeAttributeName,
322 '/' => self.state = TokenizerState::SelfClosingStartTag,
323 '>' => {
324 self.commit_token();
325 self.state = TokenizerState::Data;
326 }
327 c if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' => {
328 self.buffer.push(c);
329 match &mut self.current_token {
330 Some(Token::StartTag { name, .. }) => name.push(c),
331 Some(Token::EndTag { name }) => name.push(c),
332 _ => {}
333 }
334 }
335 _ => {
336 self.commit_token();
337 self.state = TokenizerState::Data;
338 }
339 }
340 }
341
342 fn state_before_attribute_name(&mut self, c: char) {
343 match c {
344 c if c.is_whitespace() => {}
345 '/' => self.state = TokenizerState::SelfClosingStartTag,
346 '>' => {
347 self.commit_token();
348 self.state = TokenizerState::Data;
349 }
350 c if c.is_ascii_alphanumeric() => {
351 self.state = TokenizerState::AttributeName;
352 self.buffer.push(c);
353 self.current_attribute = Some(Attribute {
354 name: c.to_string(),
355 value: String::new(),
356 });
357 }
358 _ => {}
359 }
360 }
361
362 fn state_attribute_name(&mut self, c: char) {
363 match c {
364 c if c.is_whitespace() => self.state = TokenizerState::AfterAttributeName,
365 '=' => self.state = TokenizerState::BeforeAttributeValue,
366 '/' => {
367 self.push_current_attribute();
368 self.state = TokenizerState::SelfClosingStartTag;
369 }
370 '>' => {
371 self.push_current_attribute();
372 self.commit_token();
373 self.state = TokenizerState::Data;
374 }
375 c if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' => {
376 self.buffer.push(c);
377 if let Some(attr) = &mut self.current_attribute {
378 attr.name.push(c);
379 }
380 }
381 _ => {}
382 }
383 }
384
385 fn state_before_attribute_value(&mut self, c: char) {
386 match c {
387 c if c.is_whitespace() => {}
388 '"' => self.state = TokenizerState::AttributeValueDoubleQuoted,
389 '\'' => self.state = TokenizerState::AttributeValueSingleQuoted,
390 '>' => {
391 self.push_current_attribute();
392 self.commit_token();
393 self.state = TokenizerState::Data;
394 }
395 _ => {
396 self.state = TokenizerState::AttributeValueUnquoted;
397 if let Some(attr) = &mut self.current_attribute {
398 attr.value.push(c);
399 }
400 }
401 }
402 }
403
404 fn state_attribute_value_quoted(&mut self, c: char) {
405 match (&self.state, c) {
406 (&TokenizerState::AttributeValueDoubleQuoted, '"')
407 | (&TokenizerState::AttributeValueSingleQuoted, '\'') => {
408 self.push_current_attribute();
409 self.state = TokenizerState::AfterAttributeName;
410 }
411 _ => {
412 if let Some(attr) = &mut self.current_attribute {
413 attr.value.push(c);
414 }
415 }
416 }
417 }
418
419 fn state_after_attribute_name(&mut self, c: char) {
420 match c {
421 c if c.is_whitespace() => {}
422 '/' => self.state = TokenizerState::SelfClosingStartTag,
423 '>' => {
424 self.commit_token();
425 self.state = TokenizerState::Data;
426 }
427 c if c.is_ascii_alphanumeric() => {
428 self.state = TokenizerState::AttributeName;
429 self.buffer.push(c);
430 self.current_attribute = Some(Attribute {
431 name: c.to_string(),
432 value: String::new(),
433 });
434 }
435 _ => {}
436 }
437 }
438
439 fn state_attribute_value_unquoted(&mut self, c: char) {
440 match c {
441 c if c.is_whitespace() => {
442 self.push_current_attribute();
443 self.state = TokenizerState::BeforeAttributeName;
444 }
445 '>' => {
446 self.push_current_attribute();
447 self.commit_token();
448 self.state = TokenizerState::Data;
449 }
450 _ => {
451 if let Some(attr) = &mut self.current_attribute {
452 attr.value.push(c);
453 }
454 }
455 }
456 }
457
458 fn state_self_closing_start_tag(&mut self, c: char) {
459 match c {
460 '>' => {
461 if let Some(Token::StartTag { self_closing, .. }) = &mut self.current_token {
462 *self_closing = true;
463 }
464 self.commit_token();
465 self.state = TokenizerState::Data;
466 }
467 _ => self.state = TokenizerState::Data,
468 }
469 }
470
471 fn state_end_tag_open(&mut self, c: char) {
472 match c {
473 c if c.is_ascii_alphabetic() => {
474 self.state = TokenizerState::TagName;
475 self.buffer.push(c);
476 self.current_token = Some(Token::EndTag {
477 name: c.to_string(),
478 });
479 }
480 _ => self.state = TokenizerState::Data,
481 }
482 }
483
484 fn state_comment(&mut self, c: char) {
485 match self.state {
486 TokenizerState::CommentStartDash => {
487 if c == '-' {
488 self.state = TokenizerState::Comment;
489 self.current_token = Some(Token::Comment(String::new()));
490 } else {
491 self.state = TokenizerState::BogusComment;
492 }
493 }
494 TokenizerState::Comment => {
495 if c == '-' {
496 self.state = TokenizerState::CommentEndDash;
497 } else if let Some(Token::Comment(comment)) = &mut self.current_token {
498 comment.push(c);
499 }
500 }
501 TokenizerState::CommentEndDash => {
502 if c == '-' {
503 self.state = TokenizerState::CommentEnd;
504 } else {
505 self.state = TokenizerState::Comment;
506 if let Some(Token::Comment(comment)) = &mut self.current_token {
507 comment.push('-');
508 comment.push(c);
509 }
510 }
511 }
512 TokenizerState::CommentEnd => {
513 if c == '>' {
514 self.commit_token();
515 self.state = TokenizerState::Data;
516 } else {
517 self.state = TokenizerState::Comment;
518 if let Some(Token::Comment(comment)) = &mut self.current_token {
519 comment.push_str("--");
520 comment.push(c);
521 }
522 }
523 }
524 _ => {}
525 }
526 }
527
    /// Shared handler for every doctype sub-state (`<!DOCTYPE ...>`).
    ///
    /// NOTE(review): whitespace is consumed by the first outer arm in EVERY
    /// doctype sub-state, so spaces inside a quoted public id (e.g.
    /// "-//W3C//DTD HTML 4.01//EN") are never pushed into `public_id` —
    /// confirm whether that loss is intended.
    fn state_doctype(&mut self, c: char) {
        match c {
            // Whitespace advances between the major doctype sections.
            c if c.is_whitespace() => match self.state {
                TokenizerState::Doctype => self.state = TokenizerState::DoctypeName,
                TokenizerState::DoctypeName => {
                    // NOTE(review): both PUBLIC and SYSTEM route to
                    // BeforeDoctypePublicId, so a SYSTEM identifier ends up
                    // stored in `public_id` — confirm this is intended.
                    if self.input[self.pos..].to_lowercase().starts_with("public")
                        || self.input[self.pos..].to_lowercase().starts_with("system")
                    {
                        self.pos += 6;
                        self.state = TokenizerState::BeforeDoctypePublicId;
                    }
                }
                TokenizerState::AfterDoctypePublicId => {
                    self.state = TokenizerState::DoctypeSystemId;
                }
                _ => {}
            },
            '>' => {
                // A bogus doctype is still emitted, but flagged for quirks mode.
                if let Some(Token::Doctype { force_quirks, .. }) = &mut self.current_token
                    && self.state == TokenizerState::BogusDoctype
                {
                    *force_quirks = true;
                }
                self.commit_token();
                self.state = TokenizerState::Data;
            }
            _ => {
                self.buffer.push(c);
                match self.state {
                    // A non-space character straight after "<!doctype" is bogus.
                    TokenizerState::Doctype => self.state = TokenizerState::BogusDoctype,
                    TokenizerState::DoctypeName => {
                        if let Some(Token::Doctype { name, .. }) = &mut self.current_token {
                            if name.is_none() {
                                *name = Some(c.to_string());
                            } else if let Some(n) = name {
                                n.push(c);
                            }
                        }
                    }
                    TokenizerState::BeforeDoctypePublicId => {
                        match c {
                            '"' => self.state = TokenizerState::DoctypePublicIdWithDoubleQuote,
                            '\'' => self.state = TokenizerState::DoctypePublicIdWithSingleQuote,
                            _ if c.is_whitespace() => {}
                            _ => self.state = TokenizerState::BogusDoctype,
                        }
                        // NOTE(review): the opening quote character itself is
                        // stored as the first char of `public_id` (and the
                        // closing quote is appended below), so the id keeps
                        // its surrounding quotes — confirm downstream expects that.
                        if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token {
                            *public_id = Some(c.to_string());
                        }
                    }
                    TokenizerState::DoctypePublicIdWithSingleQuote
                    | TokenizerState::DoctypePublicIdWithDoubleQuote => {
                        if let Some(Token::Doctype { public_id, .. }) = &mut self.current_token
                            && let Some(pid) = public_id
                        {
                            pid.push(c);
                        }
                        // Close the id when the matching quote arrives.
                        if (self.state == TokenizerState::DoctypePublicIdWithSingleQuote
                            && c == '\'')
                            || (self.state == TokenizerState::DoctypePublicIdWithDoubleQuote
                                && c == '"')
                        {
                            self.state = TokenizerState::AfterDoctypePublicId;
                        }
                    }
                    TokenizerState::DoctypeSystemId => {
                        if let Some(Token::Doctype { system_id, .. }) = &mut self.current_token {
                            if system_id.is_none() {
                                *system_id = Some(c.to_string());
                            } else if let Some(sid) = system_id {
                                sid.push(c);
                            }
                        }
                    }
                    _ => {}
                }
            }
        }
    }
607}
608
#[cfg(test)]
mod tests {
    use super::*;

    /// Drain the tokenizer into a Vec for easy whole-stream comparison.
    fn collect_tokens(input: &str) -> Vec<Token> {
        let mut tokenizer = Tokenizer::new(input);
        let mut tokens = Vec::new();
        while let Some(token) = tokenizer.next_token() {
            tokens.push(token);
        }
        tokens
    }

    // Plain character data becomes a single Text token.
    #[test]
    fn test_text_node() {
        let input = "Hello, world!";
        let tokens = collect_tokens(input);
        assert_eq!(tokens, vec![Token::Text("Hello, world!".to_string())]);
    }

    // Matching start/end tag pair with no attributes.
    #[test]
    fn test_simple_tag() {
        let input = "<div></div>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false
                },
                Token::EndTag {
                    name: "div".to_string()
                }
            ]
        );
    }

    // Double- and single-quoted attribute values, in source order.
    #[test]
    fn test_tag_with_attributes() {
        let input = r#"<a href="https://example.com" target='_blank'>Link</a>"#;
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "a".to_string(),
                    attributes: vec![
                        Attribute {
                            name: "href".to_string(),
                            value: "https://example.com".to_string()
                        },
                        Attribute {
                            name: "target".to_string(),
                            value: "_blank".to_string()
                        },
                    ],
                    self_closing: false
                },
                Token::Text("Link".to_string()),
                Token::EndTag {
                    name: "a".to_string()
                }
            ]
        );
    }

    // "/>" sets the self_closing flag on the start tag.
    #[test]
    fn test_self_closing_tag() {
        let input = "<img src='image.png'/>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![Attribute {
                    name: "src".to_string(),
                    value: "image.png".to_string()
                }],
                self_closing: true
            }]
        );
    }

    // Comment delimiters are stripped; inner whitespace is preserved.
    #[test]
    fn test_comment() {
        let input = "<!-- This is a comment -->";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![Token::Comment(" This is a comment ".to_string())]
        );
    }

    // Minimal doctype: only the name is populated.
    #[test]
    fn test_doctype() {
        let input = "<!DOCTYPE html>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false
            }]
        );
    }

    // A bare '&' (not part of an entity) must survive as literal text.
    // NOTE(review): `state_escape_decoding` only leaves EscapeDecoding on
    // ';', so verify this case actually round-trips as asserted here.
    #[test]
    fn test_escape_entity() {
        let input = "Hello & goodbye";
        let tokens = collect_tokens(input);
        assert_eq!(tokens, vec![Token::Text("Hello & goodbye".to_string())]);
    }

    // Nesting does not affect tokenization: tags are emitted in source order.
    #[test]
    fn test_nested_tags() {
        let input = "<div><span>Text</span></div>";
        let tokens = collect_tokens(input);
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false
                },
                Token::StartTag {
                    name: "span".to_string(),
                    attributes: vec![],
                    self_closing: false
                },
                Token::Text("Text".to_string()),
                Token::EndTag {
                    name: "span".to_string()
                },
                Token::EndTag {
                    name: "div".to_string()
                },
            ]
        );
    }
}