orinium_browser/engine/css/tokenizer.rs
1//! CSS Tokenizer
2//!
3//! This module implements a **CSS tokenizer**, responsible for converting
4//! a raw CSS source string into a flat stream of tokens.
5//!
6//! ## Responsibilities
7//!
8//! - Consume raw characters
9//! - Produce syntactic tokens defined by the CSS specification
10//! - Preserve the original structure of the input as much as possible
11//!
12//! ## Non-responsibilities
13//!
14//! - Parsing selectors or declarations
15//! - Interpreting values (lengths, colors, percentages, etc.)
16//! - Building trees or nested structures
17//!
18//! ## Design notes
19//!
20//! - Tokens are produced in a **linear stream**
21//! - Function tokens only represent the function name
22//! - Matching of parentheses and function arguments is handled by the parser
23
/// CSS token produced by the tokenizer.
///
/// This represents *syntactic units* only.
/// No semantic interpretation (length, color, etc.) is performed here.
///
/// Variants that carry text hold the content without its surrounding
/// punctuation: string quotes, the leading `#` / `@`, and the `/*` `*/`
/// comment markers are consumed by the tokenizer but not stored.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Identifier token (e.g. `div`, `color`, `--custom`)
    Ident(String),

    /// Function token (e.g. `calc`, `var`)
    ///
    /// Holds the function name only. The `(` that follows is left in the
    /// input and is produced as a separate token.
    Function(String),

    /// Plain number without unit (e.g. `0`, `1.5`)
    Number(f32),

    /// Quoted string token (e.g. `"hello"`, `'world'`)
    ///
    /// The payload is the text between the quotes with escapes resolved.
    String(String),

    /// Dimension token (e.g. `10px`, `50%`, `2em`)
    ///
    /// Percentages are also represented as a dimension
    /// with `%` as the unit.
    Dimension(f32, String),

    /// Delimiter token (single-character symbols such as `:`, `;`, `>`, `+`)
    Delim(char),

    /// Hash token (e.g. `#fff`); the payload excludes the leading `#`
    Hash(String),

    /// At-keyword token (e.g. `@media`); the payload excludes the leading `@`
    AtKeyword(String),

    /// One or more whitespace characters, collapsed into a single token
    Whitespace,

    /// Comment; the payload is the text between `/*` and `*/`
    Comment(String),

    /// End-of-input marker, returned when no characters remain
    EOF,
}
66
/// CSS tokenizer.
///
/// This struct is responsible for converting a CSS source string
/// into a stream of `Token`s.
///
/// Responsibilities:
/// - Consume raw characters
/// - Produce syntactic tokens
///
/// Non-responsibilities:
/// - Parsing declarations or selectors
/// - Interpreting values (length, color, etc.)
/// - Building trees or higher-level structures
///
/// The tokenizer borrows the input for `'a` and keeps one character of
/// state (`current`) in front of the underlying character iterator.
pub struct Tokenizer<'a> {
    /// Iterator over the input characters that come *after* `current`
    chars: std::str::Chars<'a>,

    /// Current character under examination (`None` once the input is exhausted)
    current: Option<char>,
}
87
88impl<'a> Tokenizer<'a> {
89 /// Create a new tokenizer from a CSS source string.
90 pub fn new(input: &'a str) -> Self {
91 let mut chars = input.chars();
92 let current = chars.next();
93
94 Self { chars, current }
95 }
96
97 /// Advance to the next character.
98 ///
99 /// This method should update `self.current`.
100 fn bump(&mut self) {
101 self.current = self.chars.next();
102 }
103
104 /// Peek the current character without consuming it.
105 fn peek(&self) -> Option<char> {
106 self.current
107 }
108
109 /// Peek the next character from the current one without consuming it.
110 fn peek_next(&self) -> Option<char> {
111 self.chars.clone().next()
112 }
113
114 /// Consume and return the next token from the input.
115 ///
116 /// This is the main entry point used by the parser.
117 pub fn next_token(&mut self) -> Token {
118 let token = match self.peek() {
119 Some(c) if c.is_whitespace() => self.consume_whitespace(),
120 Some(c) if is_number_start(c, self.peek_next()) => self.consume_number_like(),
121 Some(c) if is_ident_start(c) => self.consume_ident_like(),
122 Some(c) if is_string_delimiter(c) => self.consume_string_like(),
123 Some('/') => {
124 if self.peek_next() == Some('*') {
125 self.bump(); // consume '/'
126 self.bump(); // consume '*'
127 self.consume_comment()
128 } else {
129 self.bump();
130 Token::Delim('/')
131 }
132 }
133 Some('#') => {
134 self.bump(); // consume '#'
135 let mut value = String::new();
136 while let Some(c) = self.peek() {
137 if is_ident_continue(c) {
138 value.push(c);
139 self.bump();
140 } else {
141 break;
142 }
143 }
144 Token::Hash(value)
145 }
146 Some('@') => {
147 self.bump();
148 let mut value = String::new();
149 while let Some(c) = self.peek() {
150 if is_ident_continue(c) {
151 value.push(c);
152 self.bump();
153 } else {
154 break;
155 }
156 }
157 Token::AtKeyword(value)
158 }
159 Some(c) => {
160 self.bump();
161 Token::Delim(c)
162 }
163 None => Token::EOF,
164 };
165
166 log::debug!(target: "CssTokenizer", "Tokenized: {:?}", token);
167
168 token
169 }
170
171 /// Consume consecutive whitespace characters.
172 ///
173 /// Produces a single `Token::Whitespace`.
174 fn consume_whitespace(&mut self) -> Token {
175 while matches!(self.current, Some(c) if c.is_whitespace()) {
176 self.bump();
177 }
178 Token::Whitespace
179 }
180
181 /// Consume an identifier or function token.
182 ///
183 /// If an identifier is immediately followed by `(`,
184 /// this method should produce a `Token::Function`.
185 fn consume_ident_like(&mut self) -> Token {
186 let mut ident = String::new();
187
188 while let Some(c) = self.peek() {
189 if c == '\\' {
190 if let Some(escaped) = self.consume_escape() {
191 ident.push(escaped);
192 }
193 } else if is_ident_continue(c) {
194 ident.push(c);
195 self.bump();
196 } else {
197 break;
198 }
199 }
200 if self.peek() == Some('(') {
201 Token::Function(ident)
202 } else {
203 Token::Ident(ident)
204 }
205 }
206
207 fn consume_string_like(&mut self) -> Token {
208 let quote = self.peek().unwrap(); // '"' or '\''
209 self.bump(); // consume opening quote
210
211 let mut value = String::new();
212
213 while let Some(c) = self.peek() {
214 if c == quote {
215 self.bump(); // consume closing quote
216 break;
217 }
218
219 if c == '\\' {
220 if let Some(escaped) = self.consume_escape() {
221 value.push(escaped);
222 }
223 continue;
224 }
225
226 value.push(c);
227 self.bump();
228 }
229
230 Token::String(value)
231 }
232
233 /// Consume a number-like token.
234 ///
235 /// This may produce:
236 /// - `Token::Number`
237 /// - `Token::Dimension` (including `%`)
238 fn consume_number_like(&mut self) -> Token {
239 let mut buf = String::new();
240
241 let mut has_dot = if self.peek() == Some('.') {
242 buf.push('.');
243 self.bump();
244 true
245 } else {
246 false
247 };
248
249 if self.peek() == Some('-') {
250 buf.push('-');
251 self.bump();
252 }
253
254 while let Some(c) = self.peek() {
255 if c.is_ascii_digit() {
256 buf.push(c);
257 self.bump();
258 } else if c == '.' && !has_dot {
259 has_dot = true;
260 buf.push(c);
261 self.bump();
262 } else {
263 break;
264 }
265 }
266
267 let value: f32 = buf.parse().unwrap_or(0.0);
268
269 // --- unit / percentage branching ---
270 match self.peek() {
271 Some('%') => {
272 self.bump();
273 Token::Dimension(value, "%".to_string())
274 }
275 Some(c) if is_ident_start(c) => {
276 let mut unit = String::new();
277 while let Some(c) = self.peek() {
278 if is_ident_continue(c) {
279 unit.push(c);
280 self.bump();
281 } else {
282 break;
283 }
284 }
285 Token::Dimension(value, unit)
286 }
287 _ => Token::Number(value),
288 }
289 }
290
291 /// Consume a CSS comment.
292 ///
293 /// Assumes the opening `/*` has already been consumed.
294 fn consume_comment(&mut self) -> Token {
295 let mut value = String::new();
296
297 while let Some(c) = self.peek() {
298 if c == '*' && self.peek_next() == Some('/') {
299 self.bump(); // consume '*'
300 self.bump(); // consume '/'
301 break;
302 } else {
303 value.push(c);
304 self.bump();
305 }
306 }
307
308 Token::Comment(value)
309 }
310
311 fn consume_escape(&mut self) -> Option<char> {
312 self.bump(); // consume '\'
313
314 // 1. Line continuation: backslash + newline => nothing
315 match self.peek() {
316 Some('\n') => {
317 self.bump();
318 return None;
319 }
320 Some('\r') => {
321 self.bump();
322 if self.peek() == Some('\n') {
323 self.bump(); // CRLF
324 }
325 return None;
326 }
327 _ => {}
328 }
329
330 // 2. Unicode escape
331 let mut hex = String::new();
332 for _ in 0..6 {
333 match self.peek() {
334 Some(c) if c.is_ascii_hexdigit() => {
335 hex.push(c);
336 self.bump();
337 }
338 _ => break,
339 }
340 }
341
342 if !hex.is_empty() {
343 if matches!(self.peek(), Some(c) if c.is_whitespace()) {
344 self.bump(); // optional whitespace
345 }
346
347 let code = u32::from_str_radix(&hex, 16).ok()?;
348 return std::char::from_u32(code).or(Some('\u{FFFD}'));
349 }
350
351 // 3. Simple escape
352 if let Some(c) = self.peek() {
353 self.bump();
354 Some(c)
355 } else {
356 None
357 }
358 }
359}
360
/// Returns true if the character can start an identifier.
///
/// This is a simplified CSS identifier start check. Accepted characters:
/// - ASCII letters (A–Z, a–z)
/// - underscore (`_`)
/// - hyphen (`-`)
/// - backslash (`\`), which introduces an escape
/// - any non-ASCII character
fn is_ident_start(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' | '-' | '\\' => true,
        other => !other.is_ascii(),
    }
}
372
/// Returns true if the character opens or closes a CSS string.
///
/// Both the double quote (`"`) and the single quote (`'`) are accepted.
fn is_string_delimiter(c: char) -> bool {
    c == '"' || c == '\''
}
380
/// Returns true if the character can continue an identifier.
///
/// Accepted characters:
/// - ASCII letters (A–Z, a–z)
/// - ASCII digits (0–9)
/// - underscore (`_`)
/// - hyphen (`-`)
/// - any non-ASCII character
fn is_ident_continue(c: char) -> bool {
    match c {
        '_' | '-' => true,
        ascii if ascii.is_ascii() => ascii.is_ascii_alphanumeric(),
        _ => true,
    }
}
391
/// Returns true if `current` (with one character of lookahead) starts a
/// CSS number.
///
/// Recognised starts:
/// - an ASCII digit (0-9)
/// - a dot followed by a digit (e.g. `.5`)
/// - a hyphen followed by a digit or a dot (e.g. `-1`, `-.5`)
///
/// Only two characters are inspected, so a hyphen followed by a dot is
/// accepted whatever comes after the dot.
fn is_number_start(current: char, next: Option<char>) -> bool {
    let digit_follows = next.map_or(false, |c| c.is_ascii_digit());

    match current {
        '0'..='9' => true,
        '.' => digit_follows,
        '-' => digit_follows || next == Some('.'),
        _ => false,
    }
}