use token_iter::*;

#[derive(Clone, Debug, PartialEq)]
#[rustfmt::skip]
pub enum Token {
    LT, LE,                // <  <=
    GT, GE,                // >  >=
    EQ, EqEq,              // =  ==
    LParen, RParen,        // (  )
    LCurly, RCurly,        // {  }
    If,                    // if
    While,                 // while
    Ident(String),         // e.g. foo
    Int(usize),            // e.g. 273
    BadInt(String),        // e.g. 9873487239482398472498723423987234
    Unrecognized(String),  // e.g. @
}

use Token::*;

// Classifies the next token in the line, or returns None at end of input.
#[rustfmt::skip]
fn tokenizer(lx: &mut Lexer<impl Iterator<Item = char>>) -> Option<Token> {
    let is_digit = |c| char::is_ascii_digit(&c);
    Some(
        match lx.skip_while(char::is_whitespace).next()? {
            '<' => if lx.at('=') {LE} else {LT},
            '>' => if lx.at('=') {GE} else {GT},
            '=' => if lx.at('=') {EqEq} else {EQ},
            '(' => LParen,
            ')' => RParen,
            '{' => LCurly,
            '}' => RCurly,
            c if c.is_alphabetic() =>
                match lx.take_while(char::is_alphanumeric).get() {
                    "if" => If,
                    "while" => While,
                    s => Ident(s.into()),
                },
            c if is_digit(c) =>
                lx.take_while(is_digit).map(|it|
                    if let Ok(n) = it.parse::<usize>() { Int(n) }
                    else { BadInt(it.into()) }
                ),
            _ => Unrecognized(lx.into())
        }
    )
}

#[test]
fn test_empty_str() {
    let next = tokens_in_line("", &tokenizer).next();
    assert_eq!(next, None);
}

#[test]
fn test_no_whitespace() {
    let line = "if(foo<=10){x=2}";
    #[rustfmt::skip]
    let expected_tokens = vec![
        If, LParen, Ident("foo".into()), LE, Int(10), RParen,
        LCurly, Ident("x".into()), EQ, Int(2), RCurly
    ];
    let results: Vec<_> = tokens_in_line(line, &tokenizer).collect();
    let tokens: Vec<_> = results.iter().map(|(_, token)| token.clone()).collect();
    assert_eq!(tokens, expected_tokens);
}

#[test]
fn test_lots_of_whitespace() {
    let line = " if ( foo <= 10 ) \n { x = 2 } ";
    #[rustfmt::skip]
    let expected_tokens = vec![
        If, LParen, Ident("foo".into()), LE, Int(10), RParen,
        LCurly, Ident("x".into()), EQ, Int(2), RCurly
    ];
    let results: Vec<_> = tokens_in_line(line, &tokenizer).collect();
    let tokens: Vec<_> = results.iter().map(|(_, token)| token.clone()).collect();
    assert_eq!(tokens, expected_tokens);
}

#[test]
fn test_multiline() {
    let code = "\n\nif (foo<=10) {x=2}";
    #[rustfmt::skip]
    let expected_tokens = vec![
        If, LParen, Ident("foo".into()), LE, Int(10), RParen,
        LCurly, Ident("x".into()), EQ, Int(2), RCurly
    ];
    let results: Vec<_> = tokens_in(code.lines(), &tokenizer).collect();
    assert!(results.iter().all(|(line, _, _)| *line == 2));
    let tokens: Vec<_> = results
        .iter()
        .map(|(_, _, token)| (*token).clone())
        .collect();
    assert_eq!(tokens, expected_tokens);
}

#[test]
fn test_line_and_column() {
    let code = "{\n    foo = 2\n}";
    let expected_results = vec![
        (0, 0..1, LCurly),
        (1, 4..7, Ident("foo".into())),
        (1, 8..9, EQ),
        (1, 10..11, Int(2)),
        (2, 0..1, RCurly),
    ];
    let results: Vec<_> = tokens_in(code.lines(), &tokenizer).collect();
    assert_eq!(results, expected_results);
}

#[test]
fn test_line() {
    let code = "a b c d\nd e f";
    // Consumes the whole line and reports its length as an Int token.
    fn tokenizer(lx: &mut Lexer<impl Iterator<Item = char>>) -> Option<Token> {
        lx.skip_while(|_| true);
        Some(Int(lx.line().len()))
    }
    let expected_results = vec![(0, 7..7, Int(7)), (1, 5..5, Int(5))];
    let results: Vec<_> = tokens_in(code.lines(), &tokenizer).collect();
    assert_eq!(results, expected_results);
}

#[test]
fn test_multibyte_chars() {
    let line = "关于本网站的";
    // Tokenizer calls ^ LT, and anything else an Ident.
    // So this should tokenize to Ident < Ident.
    fn tokenizer(lx: &mut Lexer<impl Iterator<Item = char>>) -> Option<Token> {
        Some(match lx.next()? {
            '本' => LT,
            _ => Ident(lx.take_while(|c| c != '本').into()),
        })
    }
    let results: Vec<_> = tokens_in_line(line, &tokenizer).collect();
    let expected_results = [
        (0..2, Ident("关于".into())),
        (2..3, LT),
        (3..6, Ident("网站的".into())),
    ];
    assert_eq!(results, expected_results);
    // 6 chars at 3 bytes each: the ranges above are char columns, not byte offsets.
    assert_eq!(line.len(), 18);
}
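
// A hedged addition, not part of the original suite: a sketch exercising the
// BadInt path in `tokenizer` above, assuming `tokens_in_line` yields
// (column_range, token) pairs as in the earlier tests.
#[test]
fn test_bad_int() {
    let line = "9873487239482398472498723423987234"; // too large for usize
    let tokens: Vec<_> = tokens_in_line(line, &tokenizer)
        .map(|(_, token)| token)
        .collect();
    assert_eq!(tokens, vec![BadInt(line.into())]);
}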