//! This test just tokenizes zlib's deflate header file.

use alkale::{
    common::numeric::{
        base::{NumericalBase, StandardBase},
        IntegerParseError, ParseNumberResult,
    },
    format_notification, map_double_char_tokens, map_single_char_token,
    notification::NotificationSeverity,
    span::{Span, Spanned},
    token::Token,
    FinalizedLexerResult, LexerResult, SourceCodeScanner,
};
use criterion::{criterion_group, Criterion};

#[expect(dead_code)]
#[derive(Debug)]
pub enum CToken<'a> {
    // Keywords
    Auto,
    Double,
    Int,
    Struct,
    Break,
    Else,
    Long,
    Switch,
    Case,
    Enum,
    Register,
    Typedef,
    Char,
    Extern,
    Return,
    Union,
    Const,
    Float,
    Short,
    Unsigned,
    Continue,
    For,
    Signed,
    Void,
    Default,
    Goto,
    Sizeof,
    Volatile,
    Do,
    If,
    Static,
    While,
    // Punctuation
    OpenParen,    // (
    CloseParen,   // )
    OpenBrace,    // {
    CloseBrace,   // }
    OpenBracket,  // [
    CloseBracket, // ]
    Comma,        // ,
    Semicolon,    // ;
    Backslash,    // \
    Hashtag,      // #
    // Math Operators
    Plus,          // +
    Minus,         // -
    DoublePlus,    // ++
    DoubleMinus,   // --
    PlusEqual,     // +=
    MinusEqual,    // -=
    Asterisk,      // *
    AsteriskEqual, // *=
    Slash,         // /
    SlashEqual,    // /=
    Percent,       // %
    PercentEqual,  // %=
    // Bitwise Operators
    DoubleGreaterThan,  // >>
    DoubleGreaterEqual, // >>=
    DoubleLessThan,     // <<
    DoubleLessEqual,    // <<=
    Pipe,               // |
    PipeEqual,          // |=
    Ampersand,          // &
    AmpersandEqual,     // &=
    Caret,              // ^
    CaretEqual,         // ^=
    Tilde,              // ~
    // Boolean Operators
    GreaterThan,     // >
    GreaterEqual,    // >=
    LessThan,        // <
    LessEqual,       // <=
    DoubleEqual,     // ==
    ExclaimEqual,    // !=
    DoubleAmpersand, // &&
    DoublePipe,      // ||
    Exclaim,         // !
    // Misc Operators
    Equal,    // =
    Period,   // .
    Question, // ?
    Colon,    // :
    Arrow,    // ->
    // Nonterminals
    Identifier(&'a str),
    IntValue(u64),
    FloatValue(f64),
    CharValue(char),
    StringValue(String),
    BooleanValue(bool),
}

/// Tokenizer that scans through the input and ignores everything.
pub fn scan(program: &str) {
    let ctx = SourceCodeScanner::new(program);
    while ctx.has_next() {
        ctx.skip();
    }
}

/// Tokenizer that creates a token for every character. Has comments. Ignores span.
pub fn chars(program: &str) {
    let ctx = SourceCodeScanner::new(program);
    let mut lexer_result = LexerResult::<_, ()>::new();
    while ctx.has_next() {
        let next = unsafe { ctx.next().unwrap_unchecked() };
        if next == '/' {
            let after = ctx.next();
            match after {
                Some('/') => {
                    ctx.skip_line();
                    continue;
                }
                Some('*') => {
                    loop {
                        ctx.skip_until('*');
                        ctx.skip();
                        match ctx.next() {
                            None => break,
                            Some('/') => break,
                            _ => (),
                        }
                    }
                    continue;
                }
                _ => {}
            }
        }
        // SAFETY: Span is intentionally invalid for testing purposes. This is
        // technically UB but we never read the span so it's fine.
        lexer_result.push_token(Token::new(next, unsafe { Span::new_empty(0) }));
    }
}

/// Tokenizer that creates a token for every character. Has comments.
pub fn chars_span(program: &str) {
    let ctx = SourceCodeScanner::new(program);
    let mut lexer_result = LexerResult::<_, ()>::new();
    while ctx.has_next() {
        let next = unsafe { ctx.next_span().unwrap_unchecked() };
        if next.data == '/' {
            let after = ctx.next();
            match after {
                Some('/') => {
                    ctx.skip_line();
                    continue;
                }
                Some('*') => {
                    loop {
                        ctx.skip_until('*');
                        ctx.skip();
                        match ctx.next() {
                            None => break,
                            Some('/') => break,
                            _ => (),
                        }
                    }
                    continue;
                }
                _ => {}
            }
        }
        lexer_result.push_token(next.into());
    }
}

/// Tokenizer that only uses single-char tokens and ignores all else. Has comments.
pub fn singles(program: &str) {
    let ctx = SourceCodeScanner::new(program);
    let mut lexer_result = LexerResult::<_, ()>::new();
    while ctx.has_next() {
        use CToken::{
            Ampersand, AmpersandEqual, Arrow, Asterisk, AsteriskEqual, Backslash, Caret,
            CaretEqual, CloseBrace, CloseBracket, CloseParen, Colon, Comma, DoubleAmpersand,
            DoubleEqual, DoubleMinus, DoublePipe, DoublePlus, Equal, Exclaim, ExclaimEqual,
            Hashtag, Minus, MinusEqual, OpenBrace, OpenBracket, OpenParen, Percent, PercentEqual,
            Period, Pipe, PipeEqual, Plus, PlusEqual, Question, Semicolon, Slash, SlashEqual,
            Tilde,
        };

        // Parse out single char tokens. We can't do tokens like `+` here because they
        // may be followed by a `=`, which makes them a different token.
        map_single_char_token!(&ctx, &mut lexer_result,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '(' => OpenParen,
            ')' => CloseParen,
            ',' => Comma,
            ';' => Semicolon,
            '.' => Period,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '#' => Hashtag,
            '\\' => Backslash,
        );

        // Parse out one-to-two character tokens.
        map_double_char_tokens!(&ctx, &mut lexer_result,
            '+' => {
                '+' => DoublePlus,
                '=' => PlusEqual,
                _ => Plus,
            },
            '-' => {
                '-' => DoubleMinus,
                '=' => MinusEqual,
                '>' => Arrow,
                _ => Minus,
            },
            '*' => {
                '=' => AsteriskEqual,
                _ => Asterisk,
            },
            '/' => {
                '/' => {
                    ctx.skip_line();
                    continue;
                },
                '*' => {
                    loop {
                        ctx.skip_until('*');
                        ctx.skip();
                        match ctx.next() {
                            None => break,
                            Some('/') => break,
                            _ => (),
                        }
                    }
                    continue;
                },
                '=' => SlashEqual,
                _ => Slash,
            },
            '%' => {
                '=' => PercentEqual,
                _ => Percent,
            },
            '|' => {
                '|' => DoublePipe,
                '=' => PipeEqual,
                _ => Pipe,
            },
            '&' => {
                '&' => DoubleAmpersand,
                '=' => AmpersandEqual,
                _ => Ampersand,
            },
            '^' => {
                '=' => CaretEqual,
                _ => Caret,
            },
            '=' => {
                '=' => DoubleEqual,
                _ => Equal,
            },
            '!' => {
                '=' => ExclaimEqual,
                _ => Exclaim,
            },
        );

        ctx.skip();
    }
}

// We can assume no errors occur; even if they do, it's not a huge deal
// for this benchmark.
pub fn lexer(program: &'static str) -> FinalizedLexerResult<CToken<'static>> {
    let ctx = SourceCodeScanner::new(program);
    let mut lexer_result = LexerResult::<CToken, ()>::new();
    while ctx.has_next() {
        use CToken::*;

        // Parse out single char tokens. We can't do tokens like `+` here because they
        // may be followed by a `=`, which makes them a different token.
        map_single_char_token!(&ctx, &mut lexer_result,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '(' => OpenParen,
            ')' => CloseParen,
            ',' => Comma,
            ';' => Semicolon,
            '.' => Period,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '#' => Hashtag,
            '\\' => Backslash,
        );

        // Parse out one-to-two character tokens.
        map_double_char_tokens!(&ctx, &mut lexer_result,
            '+' => {
                '+' => DoublePlus,
                '=' => PlusEqual,
                _ => Plus,
            },
            '-' => {
                '-' => DoubleMinus,
                '=' => MinusEqual,
                '>' => Arrow,
                _ => Minus,
            },
            '*' => {
                '=' => AsteriskEqual,
                _ => Asterisk,
            },
            '/' => {
                '/' => {
                    ctx.skip_line();
                    continue;
                },
                '*' => {
                    loop {
                        ctx.skip_until('*');
                        ctx.skip();
                        match ctx.next() {
                            None => break,
                            Some('/') => break,
                            _ => (),
                        }
                    }
                    continue;
                },
                '=' => SlashEqual,
                _ => Slash,
            },
            '%' => {
                '=' => PercentEqual,
                _ => Percent,
            },
            '|' => {
                '|' => DoublePipe,
                '=' => PipeEqual,
                _ => Pipe,
            },
            '&' => {
                '&' => DoubleAmpersand,
                '=' => AmpersandEqual,
                _ => Ampersand,
            },
            '^' => {
                '=' => CaretEqual,
                _ => Caret,
            },
            '=' => {
                '=' => DoubleEqual,
                _ => Equal,
            },
            '!' => {
                '=' => ExclaimEqual,
                _ => Exclaim,
            },
        );

        // Special case: >, >=, >>, >>=.
        if ctx.peek() == Some('>') {
            let span = ctx.span();
            ctx.skip();
            let data = match ctx.peek() {
                Some('=') => {
                    ctx.skip();
                    GreaterEqual
                }
                Some('>') => {
                    ctx.skip();
                    if ctx.peek() == Some('=') {
                        ctx.skip();
                        DoubleGreaterEqual
                    } else {
                        DoubleGreaterThan
                    }
                }
                _ => GreaterThan,
            };
            let final_span = ctx.span();
            lexer_result.push_token(Token::new(data, span.up_to(&final_span)));
            continue;
        }

        // Special case: <, <=, <<, <<=.
        if ctx.peek() == Some('<') {
            let span = ctx.span();
            ctx.skip();
            let data = match ctx.peek() {
                Some('=') => {
                    ctx.skip();
                    LessEqual
                }
                Some('<') => {
                    ctx.skip();
                    if ctx.peek() == Some('=') {
                        ctx.skip();
                        DoubleLessEqual
                    } else {
                        DoubleLessThan
                    }
                }
                _ => LessThan,
            };
            let final_span = ctx.span();
            lexer_result.push_token(Token::new(data, span.up_to(&final_span)));
            continue;
        }

        // Parse out identifiers and keywords.
        if let Some(Spanned { span, data }) = ctx.try_consume_standard_identifier() {
            let data = match data {
                "auto" => Auto,
                "double" => Double,
                "int" => Int,
                "struct" => Struct,
                "break" => Break,
                "else" => Else,
                "long" => Long,
                "switch" => Switch,
                "case" => Case,
                "enum" => Enum,
                "register" => Register,
                "typedef" => Typedef,
                "char" => Char,
                "extern" => Extern,
                "return" => Return,
                "union" => Union,
                "const" => Const,
                "float" => Float,
                "short" => Short,
                "unsigned" => Unsigned,
                "continue" => Continue,
                "for" => For,
                "signed" => Signed,
                "void" => Void,
                "default" => Default,
                "goto" => Goto,
                "sizeof" => Sizeof,
                "volatile" => Volatile,
                "do" => Do,
                "if" => If,
                "static" => Static,
                "while" => While,
                _ => Identifier(data),
            };
            lexer_result.push_token(Token::new(data, span));
            continue;
        }

        // Numbers
        if let Some(Spanned { span, data: result }) = parse_c_number(&ctx) {
            match result {
                ParseNumberResult::Integer(Ok(number)) => {
                    lexer_result.push_token(Token::new(IntValue(number), span));
                }
                ParseNumberResult::Float(Ok(number)) => {
                    lexer_result.push_token(Token::new(FloatValue(number), span));
                }
                ParseNumberResult::Integer(Err(err)) => {
                    format_notification!("Error creating number token: {:?}", err)
                        .span(span)
                        .severity(NotificationSeverity::Error)
                        .report(&mut lexer_result);
                }
                ParseNumberResult::Float(Err(err)) => {
                    format_notification!("Error creating number token: {:?}", err)
                        .span(span)
                        .severity(NotificationSeverity::Error)
                        .report(&mut lexer_result);
                }
            }
            continue;
        }

        // Chars
        if let Some(Spanned { span, data: result }) = ctx.try_parse_character_token() {
            match result {
                Ok(char) => {
                    lexer_result.push_token(Token::new(CharValue(char), span));
                }
                Err(err) => {
                    format_notification!("Error creating character: {:?}", err)
                        .span(span)
                        .severity(NotificationSeverity::Error)
                        .report(&mut lexer_result);
                }
            }
            continue;
        }

        // Strings
        if let Some(Spanned { span, data: result }) = ctx.try_parse_strict_string() {
            match result {
                Ok(str) => {
                    lexer_result.push_token(Token::new(StringValue(str), span));
                }
                Err(errs) => {
                    for err in errs {
                        format_notification!("Error creating string: {:?}", err)
                            .span(span)
                            .severity(NotificationSeverity::Error)
                            .report(&mut lexer_result);
                    }
                }
            }
            continue;
        }

        // Anything left over is either whitespace or an unexpected character.
        if let Some(last) = ctx.next_span() {
            if !last.is_whitespace() {
                format_notification!("Unexpected character '{}'", last.data)
                    .span(last.span)
                    .severity(NotificationSeverity::Error)
                    .report(&mut lexer_result);
            }
        }
    }

    lexer_result.finalize()
}

fn parse_c_number(ctx: &SourceCodeScanner) -> Option<Spanned<ParseNumberResult>> {
    let Spanned { span, data } = ctx.consume_standard_number()?;
    let res = if data.starts_with("0x") {
        // 0xFF22
        let num = unsafe { data.get_unchecked(2..) };
        let parsed = parse_integer_from_base(num, &StandardBase::Hexadecimal);
        ParseNumberResult::Integer(parsed)
    } else if data.starts_with("0b") {
        // 0b1101010
        let num = unsafe { data.get_unchecked(2..) };
        let parsed = parse_integer_from_base(num, &StandardBase::Binary);
        ParseNumberResult::Integer(parsed)
    } else if data.ends_with('L') {
        // 124L
        let num = unsafe { data.get_unchecked(..data.len() - 1) };
        let parsed = parse_integer_from_base(num, &StandardBase::Decimal);
        ParseNumberResult::Integer(parsed)
    } else if data.ends_with('f') {
        // 12f
        let num = unsafe { data.get_unchecked(..data.len() - 1) };
        let stripped = num.replace('_', "");
        let parsed = stripped.parse::<f64>();
        ParseNumberResult::Float(parsed)
    } else if data.contains(['.', 'e', 'E', '+', '-']) {
        // 0.5, 2e+5, etc
        let stripped = data.replace('_', "");
        let parsed = stripped.parse::<f64>();
        ParseNumberResult::Float(parsed)
    } else if data.starts_with('0') && data.len() > 1 {
        // A leading zero means octal.
        let num = unsafe { data.get_unchecked(1..) };
        let parsed = parse_integer_from_base(num, &StandardBase::Octal);
        ParseNumberResult::Integer(parsed)
    } else {
        let parsed = parse_integer_from_base(data, &StandardBase::Decimal);
        ParseNumberResult::Integer(parsed)
    };
    Some(span.wrap(res))
}

fn parse_integer_from_base<B: NumericalBase>(s: &str, base: &B) -> Result<u64, IntegerParseError> {
    let mut out = 0_u64;
    let position = u64::from(base.position_value());
    for next in s.chars() {
        if base.includes(next) {
            // A single digit's value always fits in a u64.
            let value = u64::from(unsafe { base.value_of_unchecked(next) });
            // out = out * position + value, returning IntegerParseError::OutOfRange
            // if anything overflows.
            let Some(new_value) = out.checked_mul(position).and_then(|y| y.checked_add(value))
            else {
                return Err(IntegerParseError::OutOfRange);
            };
            out = new_value;
        } else if next != '_' {
            return Err(IntegerParseError::InvalidCharacter(next));
        }
    }
    Ok(out)
}

const CODE: &str = include_str!("deflate.h");

fn bench(criterion: &mut Criterion) {
    criterion.bench_function("deflate", |x| x.iter(|| lexer(CODE)));
    criterion.bench_function("deflate_scan", |x| x.iter(|| scan(CODE)));
    criterion.bench_function("deflate_chars", |x| x.iter(|| chars(CODE)));
    criterion.bench_function("deflate_chars_span", |x| x.iter(|| chars_span(CODE)));
    criterion.bench_function("deflate_singles", |x| x.iter(|| singles(CODE)));
}

criterion_group!(benches, bench);
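
// A minimal sanity-check sketch for `parse_integer_from_base`, not part of the
// benchmark itself. It assumes the `StandardBase` variants accept the usual
// digit sets for their bases (e.g. `Hexadecimal` accepts 0-9 and A-F, as the
// "0xFF22" comment above implies); underscore handling comes from the function
// itself, which skips '_' as a separator.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_integer_from_base_handles_common_inputs() {
        // Plain decimal.
        assert!(matches!(
            parse_integer_from_base("255", &StandardBase::Decimal),
            Ok(255)
        ));
        // Hexadecimal digits, as fed in by `parse_c_number` after stripping "0x".
        assert!(matches!(
            parse_integer_from_base("FF22", &StandardBase::Hexadecimal),
            Ok(0xFF22)
        ));
        // Underscores are treated as separators and ignored.
        assert!(matches!(
            parse_integer_from_base("1_000", &StandardBase::Decimal),
            Ok(1000)
        ));
        // Characters outside the base are rejected.
        assert!(matches!(
            parse_integer_from_base("12x", &StandardBase::Decimal),
            Err(IntegerParseError::InvalidCharacter('x'))
        ));
    }
}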