orinium_browser/engine/css/
tokenizer.rs

//! CSS Tokenizer
//!
//! This module implements a **CSS tokenizer**, responsible for converting
//! a raw CSS source string into a flat stream of tokens.
//!
//! ## Responsibilities
//!
//! - Consume raw characters
//! - Produce syntactic tokens defined by the CSS specification
//! - Preserve the original structure of the input as much as possible
//!
//! ## Non-responsibilities
//!
//! - Parsing selectors or declarations
//! - Interpreting values (lengths, colors, percentages, etc.)
//! - Building trees or nested structures
//!
//! ## Design notes
//!
//! - Tokens are produced in a **linear stream**
//! - Function tokens only represent the function name
//! - Matching of parentheses and function arguments is handled by the parser
23
/// A single lexical unit emitted by the CSS tokenizer.
///
/// Tokens are purely *syntactic*: the tokenizer never decides whether a
/// value is a length, a color, or anything else semantic.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// An identifier such as `div`, `color` or `--custom`.
    Ident(String),

    /// A function name such as `calc` or `var`.
    ///
    /// Only the name is stored; the opening `(` is not part of this token.
    Function(String),

    /// A unit-less number such as `0`, `1.5` or `-2`.
    Number(f32),

    /// The contents of a quoted string (`"hello"`, `'world'`),
    /// without the surrounding quotes.
    String(String),

    /// A number immediately followed by a unit, such as `10px` or `2em`.
    ///
    /// Percentages are represented here as well, using `%` as the unit
    /// (e.g. `50%` becomes `Dimension(50.0, "%")`).
    Dimension(f32, String),

    /// Any single character that is not part of another token
    /// (for example `:`, `;`, `>` or `+`).
    Delim(char),

    /// A `#`-prefixed name such as `#fff`; the payload excludes the `#`.
    Hash(String),

    /// An `@`-prefixed keyword such as `@media`; the payload excludes the `@`.
    AtKeyword(String),

    /// A run of one or more whitespace characters, collapsed into one token.
    Whitespace,

    /// The text found between `/*` and `*/`.
    Comment(String),

    /// Marks the end of the input.
    EOF,
}
66
/// Streaming CSS tokenizer.
///
/// Walks a borrowed CSS source string one character at a time and turns it
/// into `Token`s.
///
/// What it does:
/// - reads raw characters
/// - emits syntactic tokens
///
/// What it deliberately leaves to later stages:
/// - parsing selectors or declarations
/// - interpreting values (length, color, etc.)
/// - building trees or any other higher-level structure
pub struct Tokenizer<'a> {
    /// The characters of the input that come *after* `current`.
    chars: std::str::Chars<'a>,

    /// The character currently being examined; `None` once the input is exhausted.
    current: Option<char>,
}
87
88impl<'a> Tokenizer<'a> {
89    /// Create a new tokenizer from a CSS source string.
90    pub fn new(input: &'a str) -> Self {
91        let mut chars = input.chars();
92        let current = chars.next();
93
94        Self { chars, current }
95    }
96
97    /// Advance to the next character.
98    ///
99    /// This method should update `self.current`.
100    fn bump(&mut self) {
101        self.current = self.chars.next();
102    }
103
104    /// Peek the current character without consuming it.
105    fn peek(&self) -> Option<char> {
106        self.current
107    }
108
109    /// Peek the next character from the current one without consuming it.
110    fn peek_next(&self) -> Option<char> {
111        self.chars.clone().next()
112    }
113
114    /// Consume and return the next token from the input.
115    ///
116    /// This is the main entry point used by the parser.
117    pub fn next_token(&mut self) -> Token {
118        let token = match self.peek() {
119            Some(c) if c.is_whitespace() => self.consume_whitespace(),
120            Some(c) if is_number_start(c, self.peek_next()) => self.consume_number_like(),
121            Some(c) if is_ident_start(c) => self.consume_ident_like(),
122            Some(c) if is_string_delimiter(c) => self.consume_string_like(),
123            Some('/') => {
124                if self.peek_next() == Some('*') {
125                    self.bump(); // consume '/'
126                    self.bump(); // consume '*'
127                    self.consume_comment()
128                } else {
129                    self.bump();
130                    Token::Delim('/')
131                }
132            }
133            Some('#') => {
134                self.bump(); // consume '#'
135                let mut value = String::new();
136                while let Some(c) = self.peek() {
137                    if is_ident_continue(c) {
138                        value.push(c);
139                        self.bump();
140                    } else {
141                        break;
142                    }
143                }
144                Token::Hash(value)
145            }
146            Some('@') => {
147                self.bump();
148                let mut value = String::new();
149                while let Some(c) = self.peek() {
150                    if is_ident_continue(c) {
151                        value.push(c);
152                        self.bump();
153                    } else {
154                        break;
155                    }
156                }
157                Token::AtKeyword(value)
158            }
159            Some(c) => {
160                self.bump();
161                Token::Delim(c)
162            }
163            None => Token::EOF,
164        };
165
166        log::debug!(target: "CssTokenizer", "Tokenized: {:?}", token);
167
168        token
169    }
170
171    /// Consume consecutive whitespace characters.
172    ///
173    /// Produces a single `Token::Whitespace`.
174    fn consume_whitespace(&mut self) -> Token {
175        while matches!(self.current, Some(c) if c.is_whitespace()) {
176            self.bump();
177        }
178        Token::Whitespace
179    }
180
181    /// Consume an identifier or function token.
182    ///
183    /// If an identifier is immediately followed by `(`,
184    /// this method should produce a `Token::Function`.
185    fn consume_ident_like(&mut self) -> Token {
186        let mut ident = String::new();
187
188        while let Some(c) = self.peek() {
189            if c == '\\' {
190                if let Some(escaped) = self.consume_escape() {
191                    ident.push(escaped);
192                }
193            } else if is_ident_continue(c) {
194                ident.push(c);
195                self.bump();
196            } else {
197                break;
198            }
199        }
200        if self.peek() == Some('(') {
201            Token::Function(ident)
202        } else {
203            Token::Ident(ident)
204        }
205    }
206
207    fn consume_string_like(&mut self) -> Token {
208        let quote = self.peek().unwrap(); // '"' or '\''
209        self.bump(); // consume opening quote
210
211        let mut value = String::new();
212
213        while let Some(c) = self.peek() {
214            if c == quote {
215                self.bump(); // consume closing quote
216                break;
217            }
218
219            if c == '\\' {
220                if let Some(escaped) = self.consume_escape() {
221                    value.push(escaped);
222                }
223                continue;
224            }
225
226            value.push(c);
227            self.bump();
228        }
229
230        Token::String(value)
231    }
232
233    /// Consume a number-like token.
234    ///
235    /// This may produce:
236    /// - `Token::Number`
237    /// - `Token::Dimension` (including `%`)
238    fn consume_number_like(&mut self) -> Token {
239        let mut buf = String::new();
240
241        let mut has_dot = if self.peek() == Some('.') {
242            buf.push('.');
243            self.bump();
244            true
245        } else {
246            false
247        };
248
249        if self.peek() == Some('-') {
250            buf.push('-');
251            self.bump();
252        }
253
254        while let Some(c) = self.peek() {
255            if c.is_ascii_digit() {
256                buf.push(c);
257                self.bump();
258            } else if c == '.' && !has_dot {
259                has_dot = true;
260                buf.push(c);
261                self.bump();
262            } else {
263                break;
264            }
265        }
266
267        let value: f32 = buf.parse().unwrap_or(0.0);
268
269        // --- unit / percentage branching ---
270        match self.peek() {
271            Some('%') => {
272                self.bump();
273                Token::Dimension(value, "%".to_string())
274            }
275            Some(c) if is_ident_start(c) => {
276                let mut unit = String::new();
277                while let Some(c) = self.peek() {
278                    if is_ident_continue(c) {
279                        unit.push(c);
280                        self.bump();
281                    } else {
282                        break;
283                    }
284                }
285                Token::Dimension(value, unit)
286            }
287            _ => Token::Number(value),
288        }
289    }
290
291    /// Consume a CSS comment.
292    ///
293    /// Assumes the opening `/*` has already been consumed.
294    fn consume_comment(&mut self) -> Token {
295        let mut value = String::new();
296
297        while let Some(c) = self.peek() {
298            if c == '*' && self.peek_next() == Some('/') {
299                self.bump(); // consume '*'
300                self.bump(); // consume '/'
301                break;
302            } else {
303                value.push(c);
304                self.bump();
305            }
306        }
307
308        Token::Comment(value)
309    }
310
311    fn consume_escape(&mut self) -> Option<char> {
312        self.bump(); // consume '\'
313
314        // 1. Line continuation: backslash + newline => nothing
315        match self.peek() {
316            Some('\n') => {
317                self.bump();
318                return None;
319            }
320            Some('\r') => {
321                self.bump();
322                if self.peek() == Some('\n') {
323                    self.bump(); // CRLF
324                }
325                return None;
326            }
327            _ => {}
328        }
329
330        // 2. Unicode escape
331        let mut hex = String::new();
332        for _ in 0..6 {
333            match self.peek() {
334                Some(c) if c.is_ascii_hexdigit() => {
335                    hex.push(c);
336                    self.bump();
337                }
338                _ => break,
339            }
340        }
341
342        if !hex.is_empty() {
343            if matches!(self.peek(), Some(c) if c.is_whitespace()) {
344                self.bump(); // optional whitespace
345            }
346
347            let code = u32::from_str_radix(&hex, 16).ok()?;
348            return std::char::from_u32(code).or(Some('\u{FFFD}'));
349        }
350
351        // 3. Simple escape
352        if let Some(c) = self.peek() {
353            self.bump();
354            Some(c)
355        } else {
356            None
357        }
358    }
359}
360
/// Reports whether `c` may begin an identifier.
///
/// This is a simplified version of the CSS rule. Accepted characters:
/// - ASCII letters (A–Z, a–z)
/// - underscore (`_`)
/// - hyphen (`-`)
/// - backslash (`\`), which introduces an escape
/// - any non-ASCII character
fn is_ident_start(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' | '-' | '\\' => true,
        other => !other.is_ascii(),
    }
}
372
/// Reports whether `c` opens (and closes) a CSS string.
///
/// Both the double quote (`"`) and the single quote (`'`) qualify.
fn is_string_delimiter(c: char) -> bool {
    c == '"' || c == '\''
}
380
/// Reports whether `c` may appear inside an identifier after its first character.
///
/// Accepted characters:
/// - ASCII letters (A–Z, a–z)
/// - ASCII digits (0–9)
/// - underscore (`_`)
/// - hyphen (`-`)
/// - any non-ASCII character
fn is_ident_continue(c: char) -> bool {
    match c {
        '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' | '-' => true,
        other => !other.is_ascii(),
    }
}
391
/// Reports whether a number begins at `current`, given the character after it.
///
/// A number starts with:
/// - an ASCII digit (0-9), or
/// - a dot followed by a digit (e.g. `.5`), or
/// - a hyphen followed by a digit or a dot (e.g. `-1`, `-.5`)
fn is_number_start(current: char, next: Option<char>) -> bool {
    let next_is_digit = next.map_or(false, |c| c.is_ascii_digit());

    match current {
        '.' => next_is_digit,
        '-' => next_is_digit || next == Some('.'),
        other => other.is_ascii_digit(),
    }
}