//
// tests/lexer.rs
// The PHiLe Compiler
//
// Created by Arpad Goretity (H2CO3)
// on 31/07/2017
//

#![cfg(test)]
#![deny(missing_debug_implementations, missing_copy_implementations,
        trivial_casts, trivial_numeric_casts,
        unsafe_code,
        unstable_features,
        unused_import_braces, unused_qualifications)]
#![cfg_attr(feature = "cargo-clippy",
            allow(match_same_arms, clone_on_ref_ptr))]
#![cfg_attr(feature = "cargo-clippy",
            deny(wrong_pub_self_convention, used_underscore_binding,
                 stutter, similar_names, pub_enum_variant_names,
                 // TODO(H2CO3): locally allowing non_ascii_literal doesn't work
                 /* non_ascii_literal, */ unicode_not_nfc,
                 /* result_unwrap_used, option_unwrap_used, */ // TODO(H2CO3): fix these
                 option_map_unwrap_or_else, option_map_unwrap_or, filter_map,
                 shadow_unrelated, shadow_reuse, shadow_same,
                 int_plus_one, string_add_assign,
                 if_not_else,
                 invalid_upcast_comparisons,
                 cast_sign_loss, cast_precision_loss,
                 cast_possible_wrap, cast_possible_truncation,
                 mutex_integer, mut_mut,
                 items_after_statements,
                 print_stdout,
                 mem_forget,
                 maybe_infinite_iter))]

#[macro_use]
extern crate quickcheck;
#[macro_use]
extern crate lazy_static;
extern crate unicode_xid;
extern crate unicode_segmentation;
extern crate regex;
extern crate rand;
extern crate phile;

use std::char;
use std::str;
use std::vec;
use std::ops;
use std::iter;
use std::ascii::AsciiExt;
use std::fmt::Write;
use quickcheck::{ Arbitrary, Gen };
use unicode_xid::UnicodeXID;
use unicode_segmentation::UnicodeSegmentation;
use regex::Regex;
use phile::error::Error;
use phile::lexer;
use phile::util::grapheme_count;

//
// Types for generating correct and incorrect source code
//

// Append a lexeme onto `source`'s buffer
trait Lexeme: Arbitrary {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind;
}

// Metadata about a token, without the actual underlying lexeme
#[derive(Debug, Clone)]
struct TokenHeader {
    slice: ops::Range<usize>,
    range: lexer::Range,
    kind: lexer::TokenKind,
}

type ValidSourceFile = Vec<ValidToken>;

// Represents some well-formed multi-file input to the lexer
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidSource {
    files: Vec<ValidSourceFile>,
}

impl ValidSource {
    fn render(&self) -> (Vec<String>, Vec<TokenHeader>) {
        let mut strings = Vec::with_capacity(self.files.len());
        let mut tokens = Vec::with_capacity(self.files.len() * 100);

        for (i, file) in self.files.iter().enumerate() {
            let mut buf = String::with_capacity(file.len() * 16);
            let mut start = lexer::Location { src_idx: i, line: 1, column: 1 };

            for token in file {
                let mut end = start;
                let old_len = buf.len();
                let kind = token.render(&mut buf, &mut end);
                let new_len = buf.len();
                let slice = old_len..new_len;
                let range = lexer::Range { start, end };

                tokens.push(TokenHeader { slice, range, kind });
                start = end;
            }

            strings.push(buf);
        }

        (strings, tokens)
    }
}

impl Arbitrary for ValidSource {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        // Generate a random number of source "files", but at least 1.
        // Fill each file with a random number of correct lexemes, but at least 1.
        let files = (0..g.gen_range(1, 8)).map(|_| {
            let gen_size = g.size() / 8;

            (0..g.gen_range(1, gen_size)).flat_map(|_| {
                let ws_gens: &[fn(&mut G) -> ValidToken] = &[
                    |g| ValidToken::Whitespace(ValidWhitespace::arbitrary(g)),
                    |g| ValidToken::LineComment(ValidLineComment::arbitrary(g)),
                ];
                let non_ws_gens: &[fn(&mut G) -> ValidToken] = &[
                    |g| ValidToken::Word(ValidWord::arbitrary(g)),
                    |g| ValidToken::Number(ValidNumber::arbitrary(g)),
                    |g| ValidToken::Punct(ValidPunct::arbitrary(g)),
                    |g| ValidToken::String(ValidString::arbitrary(g)),
                ];
                let ws_gen = *g.choose(ws_gens).unwrap();
                let non_ws_gen = *g.choose(non_ws_gens).unwrap();

                iter::once(ws_gen(g)).chain(iter::once(non_ws_gen(g)))
            }).collect()
        }).collect();

        ValidSource { files }
    }

    // First, check which source file the bug is in,
    // then try to shrink that specific file.
    // Token/lexeme sequences generated by Arbitrary always
    // contain a whitespace or a line comment, followed by
    // some other kind of token/lexeme. Therefore, we can
    // reliably shrink the sequence by picking pairs of
    // tokens starting on even indices. For each such pair,
    // generate the Cartesian product of the two lists of
    // shrunk lexemes, one for the whitespace, one for the
    // following non-whitespace.
    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        let it = self.files.iter().flat_map(|tokens| {
            tokens.chunks(2).flat_map(move |chunk| {
                chunk[0].shrink().flat_map(move |ws| {
                    chunk[1].shrink().flat_map(move |non_ws| {
                        iter::once(ValidSource {
                            files: vec![vec![ws.clone(), non_ws.clone()]]
                        })
                    })
                })
            })
        });

        Box::new(it.collect::<Vec<_>>().into_iter())
    }
}

// Poor man's dynamic dispatch for non-object-safe traits: enums!
#[derive(Debug, Clone, PartialEq, Eq)]
enum ValidToken {
    Whitespace(ValidWhitespace),
    LineComment(ValidLineComment),
    Word(ValidWord),
    Number(ValidNumber),
    Punct(ValidPunct),
    String(ValidString),
}

impl Lexeme for ValidToken {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        use ValidToken::*;

        match *self {
            Whitespace(ref ws) => ws.render(buf, end),
            LineComment(ref lc) => lc.render(buf, end),
            Word(ref word) => word.render(buf, end),
            Number(ref num) => num.render(buf, end),
            Punct(ref punct) => punct.render(buf, end),
            String(ref s) => s.render(buf, end),
        }
    }
}

impl Arbitrary for ValidToken {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        use ValidToken::*;

        let token_gens: &[fn(&mut G) -> Self] = &[
            |g| Whitespace(ValidWhitespace::arbitrary(g)),
            |g| LineComment(ValidLineComment::arbitrary(g)),
            |g| Word(ValidWord::arbitrary(g)),
            |g| Number(ValidNumber::arbitrary(g)),
            |g| String(ValidString::arbitrary(g)),
            |g| Punct(ValidPunct::arbitrary(g)),
        ];
        let gen = *g.choose(token_gens).unwrap();

        gen(g)
    }

    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        use ValidToken::*;

        match *self {
            Whitespace(ref ws) => Box::new(ws.shrink().map(Whitespace)),
            LineComment(ref lc) => Box::new(lc.shrink().map(LineComment)),
            Word(ref word) => Box::new(word.shrink().map(Word)),
            Number(ref num) => Box::new(num.shrink().map(Number)),
            Punct(ref punct) => Box::new(punct.shrink().map(Punct)),
            String(ref s) => Box::new(s.shrink().map(String)),
        }
    }
}

//
// A lexeme containing well-formed whitespace characters,
// horizontal as well as vertical.
//
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidWhitespace {
    buf: String,
}

impl Arbitrary for ValidWhitespace {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        // generate at least one valid whitespace character
        let buf = (0..g.gen_range(1, 16)).map(|_| {
            let chars = if g.gen() { HOR_WS } else { VER_WS };
            *g.choose(chars).unwrap()
        }).collect();

        ValidWhitespace { buf }
    }

    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        let num_chars = self.buf.chars().count();

        // do not return empty string as valid whitespace
        if num_chars <= 1 {
            return quickcheck::empty_shrinker()
        }

        // leave out each character, one-by-one
        let iter = (0..num_chars).map(|i| {
            let buf = self.buf
                .chars()
                .enumerate()
                .filter_map(|(j, c)| if j == i { None } else { Some(c) })
                .collect();

            ValidWhitespace { buf }
        });

        Box::new(iter.collect::<Vec<_>>().into_iter())
    }
}

impl Lexeme for ValidWhitespace {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        for g in self.buf.graphemes(true) {
            if g.contains(VER_WS) {
                end.column = 1;
                end.line += 1;
            } else {
                end.column += 1;
            }
        }

        *buf += &self.buf;

        lexer::TokenKind::Whitespace
    }
}

//
// A lexeme representing a line comment ending with a newline.
//
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidLineComment {
    buf: String,
}

impl Arbitrary for ValidLineComment {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        // Generate 0 or more valid non-newline extended grapheme clusters.
        // Generating the newline character has to be done first because the
        // map iterator mutably borrows the PRNG for the rest of the fn body.
        let newline = *g.choose(VER_WS).unwrap();
        let it = (0..g.gen_range(0, 16)).map(|_| {
            loop {
                let ch: char = g.gen();
                if !is_ver_ws(ch) {
                    break ch
                }
            }
        });
        let buf = iter::once('#').chain(it).chain(iter::once(newline)).collect();

        ValidLineComment { buf }
    }

    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        // leave out each character, one-by-one,
        // except the leading '#' and the trailing newline
        let it = (1..self.buf.chars().count() - 1).map(|i| {
            let buf: String = self.buf
                .chars()
                .enumerate()
                .filter_map(|(j, c)| if j == i { None } else { Some(c) })
                .collect();

            assert!(buf.starts_with('#'));
            assert!(is_ver_ws(buf.chars().last().unwrap()));

            ValidLineComment { buf }
        });

        Box::new(it.collect::<Vec<_>>().into_iter())
    }
}

impl Lexeme for ValidLineComment {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        // A line comment represented by this type always ends in a newline.
        assert!(self.buf.starts_with('#'));
        assert!(is_ver_ws(self.buf.chars().last().unwrap()));

        end.line += 1;
        end.column = 1;

        *buf += &self.buf;

        lexer::TokenKind::Comment
    }
}

//
// A lexeme representing words, that is, identifiers and keywords
//
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidWord {
    buf: String,
}

impl Arbitrary for ValidWord {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        // Generate exactly one valid XID_Start char,
        // followed by any number of valid XID_Continue ones.
        let first = loop {
            let ch: char = g.gen();
            if is_ident_start(ch) {
                break iter::once(ch)
            }
        };
        let rest = (0..g.gen_range(0, 16)).map(|_| {
            loop {
                let ch: char = g.gen();
                if is_ident_cont(ch) {
                    break ch
                }
            }
        });

        ValidWord { buf: first.chain(rest).collect() }
    }

    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        // The first char always has to be a XID_Start, so don't touch it.
        // Leave out the rest of the chars, one by one.
        let it = (1..self.buf.chars().count()).map(|i| {
            let buf: String = self.buf
                .chars()
                .enumerate()
                .filter_map(|(j, c)| if j == i { None } else { Some(c) })
                .collect();

            assert!(is_ident_start(buf.chars().next().unwrap()));

            ValidWord { buf }
        });

        Box::new(it.collect::<Vec<_>>().into_iter())
    }
}

impl Lexeme for ValidWord {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        // Identifiers must not contain whitespace, including newlines
        assert!(!self.buf.contains(char::is_whitespace));

        end.column += grapheme_count(&self.buf); // because we contain no newlines

        *buf += &self.buf;

        lexer::TokenKind::Word
    }
}

//
// A type representing valid numeric (integer and floating-point) literals
//
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidNumber {
    buf: String,
    kind: ValidNumberKind,
    prefix_len: usize, // this many leading bytes to be preserved by shrink()
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ValidNumberKind {
    DecimalInt,
    BinaryInt,
    OctalInt,
    HexInt,
    FloatingPoint,
}

impl ValidNumber {
    fn arbitrary_int<G: Gen>(g: &mut G, prefix: &str, digits: &[char], kind: ValidNumberKind) -> Self {
        let range = 0..g.gen_range(1, 16);
        let digits_iter = range.map(|_| *g.choose(digits).unwrap());
        let buf = prefix.chars().chain(digits_iter).collect();
        let prefix_len = prefix.len();

        ValidNumber { buf, kind, prefix_len }
    }

    fn arbitrary_decimal_int<G: Gen>(g: &mut G) -> Self {
        Self::arbitrary_int(
            g,
            "",
            &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
            ValidNumberKind::DecimalInt,
        )
    }

    fn arbitrary_binary_int<G: Gen>(g: &mut G) -> Self {
        let prefix = if g.gen() { "0b" } else { "0B" };

        Self::arbitrary_int(
            g,
            prefix,
            &['0', '1'],
            ValidNumberKind::BinaryInt,
        )
    }

    fn arbitrary_octal_int<G: Gen>(g: &mut G) -> Self {
        let prefix = if g.gen() { "0o" } else { "0O" };

        Self::arbitrary_int(
            g,
            prefix,
            &['0', '1', '2', '3', '4', '5', '6', '7'],
            ValidNumberKind::OctalInt,
        )
    }

    fn arbitrary_hex_int<G: Gen>(g: &mut G) -> Self {
        let prefix = if g.gen() { "0x" } else { "0X" };

        Self::arbitrary_int(
            g,
            prefix,
            &[
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                'a', 'b', 'c', 'd', 'e', 'f',
                'A', 'B', 'C', 'D', 'E', 'F',
            ],
            ValidNumberKind::HexInt,
        )
    }

    fn decimal_digits<G: Gen>(g: &mut G) -> vec::IntoIter<char> {
        let digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

        (0..g.gen_range(1, 16))
            .map(|_| *g.choose(&digits).unwrap())
            .collect::<Vec<_>>()
            .into_iter()
    }

    fn arbitrary_float<G: Gen>(g: &mut G) -> Self {
        let integral = Self::decimal_digits(g);
        let fraction = Self::decimal_digits(g);
        let dot = iter::once('.');
        let exponent: Box<Iterator<Item = char>> = if g.gen() {
            let e = iter::once(if g.gen() { 'e' } else { 'E' });
            let ds = Self::decimal_digits(g);
            let sign: Box<Iterator<Item = char>> = if g.gen() {
                Box::new(iter::once(if g.gen() { '+' } else { '-' }))
            } else {
                Box::new(iter::empty())
            };

            Box::new(e.chain(sign).chain(ds))
        } else {
            Box::new(iter::empty())
        };
        let buf = integral.chain(dot).chain(fraction).chain(exponent).collect();
        let kind = ValidNumberKind::FloatingPoint;
        let prefix_len = 0;

        ValidNumber { buf, kind, prefix_len }
    }

    fn shrink_int(&self) -> Box<Iterator<Item = Self>> {
        assert_ne!(self.kind, ValidNumberKind::FloatingPoint);
        assert!(self.prefix_len < self.buf.len());

        let prefix = &self.buf[..self.prefix_len];
        let payload = &self.buf[self.prefix_len..];

        // do not return empty string as a valid integer literal
        if payload.chars().count() <= 1 {
            return quickcheck::empty_shrinker()
        }

        // Remove digits one-by-one
        let it = (0..payload.chars().count()).map(|i| {
            let digits = payload
                .chars()
                .enumerate()
                .filter_map(|(j, c)| if j == i { None } else { Some(c) });

            ValidNumber {
                buf: prefix.chars().chain(digits).collect(),
                kind: self.kind,
                prefix_len: self.prefix_len,
            }
        });

        Box::new(it.collect::<Vec<_>>().into_iter())
    }
}

impl Arbitrary for ValidNumber {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        let funcs: &[fn(&mut G) -> Self] = &[
            Self::arbitrary_decimal_int,
            Self::arbitrary_binary_int,
            Self::arbitrary_octal_int,
            Self::arbitrary_hex_int,
            Self::arbitrary_float,
        ];
        let func = g.choose(funcs).unwrap();

        func(g)
    }

    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        use ValidNumberKind::*;

        match self.kind {
            DecimalInt | BinaryInt | OctalInt | HexInt => self.shrink_int(),
            FloatingPoint => quickcheck::empty_shrinker(), // too hard
        }
    }
}

impl Lexeme for ValidNumber {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        // Numeric literals must not contain whitespace, including newlines
        assert!(!self.buf.contains(char::is_whitespace));

        end.column += grapheme_count(&self.buf); // because we contain no newlines

        *buf += &self.buf;

        lexer::TokenKind::Numeric
    }
}

//
// Type for generating an operator or other punctuation character
//
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidPunct {
    value: &'static str,
}

// No shrink(); operators are atomic
impl Arbitrary for ValidPunct {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        ValidPunct { value: *g.choose(PUNCTUATION).unwrap() }
    }
}

impl Lexeme for ValidPunct {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        assert!(!self.value.contains(char::is_whitespace));
        assert!(self.value.is_ascii());

        end.column += grapheme_count(self.value);

        *buf += self.value;

        lexer::TokenKind::Punctuation
    }
}

//
// Type for generating valid string literals
//
#[derive(Debug, Clone, PartialEq, Eq)]
struct ValidString {
    chars: Vec<CharacterLiteral>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharacterLiteral {
    Unescaped(char),
    Quote,
    Backslash,
    Cr,
    Lf,
    Tab,
    Hex(u8),
    Unicode(char),
}

impl Arbitrary for ValidString {
    fn arbitrary<G: Gen>(g: &mut G) -> Self {
        use CharacterLiteral::*;

        // generate a possibly empty string, 25% of which are escape sequences
        let it = (0..g.gen_range(0, 16)).map(|_| {
            if g.gen::<u8>() < 0x40 {
                let char_gens: &[fn(&mut G) -> CharacterLiteral] = &[
                    |_| Quote,
                    |_| Backslash,
                    |_| Cr,
                    |_| Lf,
                    |_| Tab,
                    |g| Hex(g.gen()),
                    |g| Unicode(g.gen()),
                ];
                let char_gen = *g.choose(char_gens).unwrap();

                return char_gen(g);
            }

            // Otherwise, generate any valid Unicode scalar except " and \
            loop {
                match g.gen::<char>() {
                    '"' | '\\' => {},
                    ch => break Unescaped(ch),
                }
            }
        });

        ValidString { chars: it.collect() }
    }

    fn shrink(&self) -> Box<Iterator<Item = Self>> {
        let it = (0..self.chars.len()).map(|i| {
            let chars = self.chars
                .iter()
                .enumerate()
                .filter_map(|(j, &c)| if j == i { None } else { Some(c) })
                .collect();

            ValidString { chars }
        });

        Box::new(it.collect::<Vec<_>>().into_iter())
    }
}

impl Lexeme for ValidString {
    fn render(&self, buf: &mut String, end: &mut lexer::Location) -> lexer::TokenKind {
        use CharacterLiteral::*;

        let old_len = buf.len();

        buf.reserve(self.chars.len() * 2);
        buf.push('"');

        for ch in &self.chars {
            match *ch {
                Unescaped(uch) => buf.push(uch),
                Backslash => *buf += r#"\\"#,
                Quote => *buf += r#"\""#,
                Cr => *buf += r#"\r"#,
                Lf => *buf += r#"\n"#,
                Tab => *buf += r#"\t"#,
                Hex(byte) => write!(buf, "\\x{:02x}", byte).unwrap(),
                Unicode(uch) => write!(buf, "\\u{{{:x}}}", uch as u32).unwrap(),
            }
        }

        buf.push('"');

        for g in buf[old_len..].graphemes(true) {
            // if one char is vertical whitespace, then all of them must be so
            if g.contains(VER_WS) {
                assert!(g.chars().all(|ch| VER_WS.contains(&ch)));
                end.line += 1;
                end.column = 1;
            } else {
                end.column += 1;
            }
        }

        lexer::TokenKind::String
    }
}
// TODO(H2CO3): test invalid lexemes (e.g. unterminated string
// literals, string literals with invalid escapes or disallowed
// [e.g. deprecated, surrogate, ...] characters, lonely integer
// base prefixes without following digits [0b, 0x, etc.],
// floating-point literals with missing parts [e.g. .5, 1.])
// using QuickCheck. (Currently there are only manually-written
// unit tests.)

//
// Constants and functions for generating pseudorandom strings
//

// Horizontal whitespace
static HOR_WS: &[char] = &[
    '\u{0009}', '\u{0020}', '\u{00A0}', '\u{1680}', '\u{2000}', '\u{2001}',
    '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}', '\u{2006}', '\u{2007}',
    '\u{2008}', '\u{2009}', '\u{200A}', '\u{202F}', '\u{205F}', '\u{3000}',
];

// Vertical whitespace, a.k.a. newline characters
static VER_WS: &[char] = &[
    '\u{000A}', '\u{000B}', '\u{000C}', '\u{000D}', '\u{0085}', '\u{2028}', '\u{2029}',
];

// Valid operators
static PUNCTUATION: &[&str] = &[
    "<->", "<->!", "<->?", "<->*", "<->+",
    "!<->", "!<->!", "!<->?", "!<->*", "!<->+",
    "?<->", "?<->!", "?<->?", "?<->*", "?<->+",
    "*<->", "*<->!", "*<->?", "*<->*", "*<->+",
    "+<->", "+<->!", "+<->?", "+<->*", "+<->+",
    "...", "..", ".", "<=", ">=", "<", ">", "==", "!=", "->", "=>", "=",
    "::", ":", "(", ")", "[", "]", "{", "}",
    "+", "-", "*", "/", "%", "&", "|", "~", "?", ",", ";",
];

const CHAR_MAX_BYTES: usize = 4;

fn is_hor_ws(ch: char) -> bool {
    HOR_WS.contains(&ch)
}

fn is_ver_ws(ch: char) -> bool {
    VER_WS.contains(&ch)
}

// Regex uses an outdated (Unicode 7.0) definition of XID_{Start,Continue},
// whereas the UnicodeXID crate conforms to Unicode 9.0. This makes
// unit tests fail for some code points, e.g. U+17828.
// See https://github.com/rust-lang/regex/issues/391.
// TODO(H2CO3): this is both horribly slow and absolutely disgusting,
// and it doesn't actually test _anything_, because the same regex that
// is used for lexing is also utilized in generating test cases.
// Consequently, we must rewrite these two functions using UnicodeXID
// once the regex crate is updated to a more recent version of Unicode...
// When that is done, we can do away with the regex and lazy_static
// imports in the test crate, and with the lazy_static dev dependency.
#[allow(non_upper_case_globals)]
fn is_ident_start(ch: char) -> bool {
    // UnicodeXID::is_xid_start(ch) || ch == '_'
    lazy_static! {
        static ref re: Regex = Regex::new(r"^[_\p{XID_Start}]$").unwrap();
    }

    re.is_match(ch.encode_utf8(&mut [0; CHAR_MAX_BYTES]))
}

#[allow(non_upper_case_globals)]
fn is_ident_cont(ch: char) -> bool {
    // UnicodeXID::is_xid_continue(ch)
    lazy_static! {
        static ref re: Regex = Regex::new(r"^\p{XID_Continue}$").unwrap();
    }

    re.is_match(ch.encode_utf8(&mut [0; CHAR_MAX_BYTES]))
}

// Takes an iterator of shrinkable items and yields each item followed,
// recursively, by everything transitively reachable by shrinking it.
fn shrunk_transitive_closure<T, I>(it: I) -> vec::IntoIter<T>
    where T: Arbitrary,
          I: Iterator<Item = T>
{
    let result = it.flat_map(|item| {
        let shrunk_items = item.shrink();
        iter::once(item).chain(shrunk_transitive_closure(shrunk_items))
    });

    result.collect::<Vec<_>>().into_iter()
}

//
// Actual Unit Tests
//

#[test]
fn shrink_valid_whitespace_items() {
    let ws = ValidWhitespace { buf: " \r\n".to_owned() };
    let actual: Vec<_> = shrunk_transitive_closure(iter::once(ws))
        .map(|lexeme| lexeme.buf)
        .collect();
    let expected = [
        " \r\n",
        "\r\n", "\n", "\r",
        " \n", "\n", " ",
        " \r", "\r", " ",
    ];

    assert_eq!(actual, expected);
}

#[test]
fn shrink_valid_whitespace_size() {
    fn num_shrunk(len: usize) -> usize {
        match len {
            0 => panic!("Valid whitespace must be at least one char long"),
            1 => 1,
            _ => 1 + len * num_shrunk(len - 1),
        }
    }

    let mut g = quickcheck::StdGen::new(rand::OsRng::new().unwrap(), 100);

    for _ in 0..100 {
        let ws = ValidWhitespace::arbitrary(&mut g);
        let num_chars = ws.buf.chars().count();

        // this blows up exponentially, so only check for length < 6
        if num_chars >= 6 {
            continue
        }

        let num_actual = shrunk_transitive_closure(iter::once(ws.clone())).count();
        let num_expected = num_shrunk(num_chars);

        assert_eq!(num_actual, num_expected, "{:?}", ws);
    }
}

#[test]
fn shrink_valid_line_comment_items() {
    let comment = ValidLineComment {
        buf: "#XYZ\n".to_owned(),
    };
    let actual: Vec<_> = shrunk_transitive_closure(iter::once(comment))
        .map(|lexeme| lexeme.buf)
        .collect();
    let expected = [
        "#XYZ\n",
        "#YZ\n", "#Z\n", "#\n", "#Y\n", "#\n",
        "#XZ\n", "#Z\n", "#\n", "#X\n", "#\n",
        "#XY\n", "#Y\n", "#\n", "#X\n", "#\n",
    ];

    assert_eq!(actual, expected);
}

#[test]
fn shrink_valid_line_comment_size() {
    fn num_shrunk(len: usize) -> usize {
        match len {
            0 | 1 => panic!("Valid line comment must be at least 2 chars long (#\n)"),
            2 => 1,
            _ => 1 + (len - 2) * num_shrunk(len - 1),
        }
    }

    let mut g = quickcheck::StdGen::new(rand::OsRng::new().unwrap(), 100);

    for _ in 0..100 {
        let comment = ValidLineComment::arbitrary(&mut g);
        let num_chars = comment.buf.chars().count();

        // this blows up exponentially, so only check for length < 8
        if num_chars >= 8 {
            continue
        }

        let num_actual = shrunk_transitive_closure(iter::once(comment.clone())).count();
        let num_expected = num_shrunk(num_chars);

        assert_eq!(num_actual, num_expected, "{:?}", comment);
    }
}

#[test]
fn shrink_valid_word_items() {
    let word = ValidWord {
        buf: "a1b2".to_owned(),
    };
    let actual: Vec<_> = shrunk_transitive_closure(iter::once(word))
        .map(|lexeme| lexeme.buf)
        .collect();
    let expected = [
        "a1b2",
        "ab2", "a2", "a", "ab", "a",
        "a12", "a2", "a", "a1", "a",
        "a1b", "ab", "a", "a1", "a",
    ];

    assert_eq!(actual, expected);
}

#[test]
fn shrink_valid_word_size() {
    fn num_shrunk(len: usize) -> usize {
        match len {
            0 => panic!("Valid word must be at least one char long"),
            1 => 1,
            _ => 1 + (len - 1) * num_shrunk(len - 1),
        }
    }

    let mut g = quickcheck::StdGen::new(rand::OsRng::new().unwrap(), 100);

    for _ in 0..100 {
        let word = ValidWord::arbitrary(&mut g);
        let num_chars = word.buf.chars().count();

        // this blows up exponentially, so only check for length < 6
        if num_chars >= 6 {
            continue
        }

        let num_actual = shrunk_transitive_closure(iter::once(word.clone())).count();
        let num_expected = num_shrunk(num_chars);

        assert_eq!(num_actual, num_expected, "{:?}", word);
    }
}

#[test]
fn shrink_valid_number_items() {
    let numbers = vec![
        (
            ValidNumber {
                buf: "908".to_owned(),
                kind: ValidNumberKind::DecimalInt,
                prefix_len: 0,
            },
            vec![
                "908",
                "08", "8", "0", "98", "8", "9", "90", "0", "9",
            ],
        ),
        (
            ValidNumber {
                buf: "0b110".to_owned(),
                kind: ValidNumberKind::BinaryInt,
                prefix_len: 2,
            },
            vec![
                "0b110",
                "0b10", "0b0", "0b1", "0b10", "0b0", "0b1", "0b11", "0b1", "0b1",
            ],
        ),
        (
            ValidNumber {
                buf: "0o704".to_owned(),
                kind: ValidNumberKind::OctalInt,
                prefix_len: 2,
            },
            vec![
                "0o704",
                "0o04", "0o4", "0o0", "0o74", "0o4", "0o7", "0o70", "0o0", "0o7",
            ],
        ),
        (
            ValidNumber {
                buf: "0x30f".to_owned(),
                kind: ValidNumberKind::HexInt,
                prefix_len: 2,
            },
            vec![
                "0x30f",
                "0x0f", "0xf", "0x0", "0x3f", "0xf", "0x3", "0x30", "0x0", "0x3",
            ],
        ),
        (
            ValidNumber {
                buf: "123.456e+78".to_owned(),
                kind: ValidNumberKind::FloatingPoint,
                prefix_len: 0,
            },
            vec!["123.456e+78"],
        ),
    ];

    for (number, expected) in numbers {
        let actual: Vec<_> = shrunk_transitive_closure(iter::once(number))
            .map(|lexeme| lexeme.buf)
            .collect();

        assert_eq!(actual, expected);
    }
}

#[test]
fn shrink_valid_punctuation() {
    for &value in PUNCTUATION {
        let it = iter::once(ValidPunct { value });
        let items: Vec<_> = shrunk_transitive_closure(it)
            .map(|lexeme| lexeme.value)
            .collect();

        assert_eq!(items, [value]);
    }
}

#[test]
fn shrink_valid_string_items() {
    use CharacterLiteral::*;

    let s = ValidString {
        chars: vec![Quote, Unicode('\u{0170}'), Unescaped('@')]
    };
    let actual: Vec<_> = shrunk_transitive_closure(iter::once(s))
        .map(|lexeme| lexeme.chars)
        .collect();
    let expected = [
        vec![Quote, Unicode('\u{0170}'), Unescaped('@')],
        vec![Unicode('\u{0170}'), Unescaped('@')],
        vec![Unescaped('@')],
        vec![],
        vec![Unicode('\u{0170}')],
        vec![],
        vec![Quote, Unescaped('@')],
        vec![Unescaped('@')],
        vec![],
        vec![Quote],
        vec![],
        vec![Quote, Unicode('\u{0170}')],
        vec![Unicode('\u{0170}')],
        vec![],
        vec![Quote],
        vec![],
    ];

    assert_eq!(actual, expected);
}

#[test]
fn shrink_valid_string_size() {
    fn num_shrunk(len: usize) -> usize {
        match len {
            0 => 1,
            _ => 1 + len * num_shrunk(len - 1),
        }
    }

    let mut g = quickcheck::StdGen::new(rand::OsRng::new().unwrap(), 100);

    for _ in 0..100 {
        let s = ValidString::arbitrary(&mut g);
        let num_chars = s.chars.len();

        // this blows up exponentially, so only check for length < 6
        if num_chars >= 6 {
            continue
        }

        let num_actual = shrunk_transitive_closure(iter::once(s.clone())).count();
        let num_expected = num_shrunk(num_chars);

        assert_eq!(num_actual, num_expected, "{:#?}", s);
    }
}

#[test]
fn shrink_valid_source() {
    let source = ValidSource {
        files: vec![
            vec![
                ValidToken::Whitespace(ValidWhitespace { buf: " \r".to_owned() }),
                ValidToken::Number(
                    ValidNumber { kind: ValidNumberKind::DecimalInt, buf: "10".to_owned(), prefix_len: 0 }
                ),
                ValidToken::LineComment(ValidLineComment { buf: "#a\n".to_owned() }),
                ValidToken::Word(ValidWord { buf: "XYZ".to_owned() }),
            ],
        ]
    };
    let expected = vec![
        // Results of shrinking the first pair of lexemes
        ValidSource {
            files: vec![
                vec![
                    ValidToken::Whitespace(ValidWhitespace { buf: "\r".to_owned() }),
                    ValidToken::Number(
                        ValidNumber { kind: ValidNumberKind::DecimalInt, buf: "0".to_owned(), prefix_len: 0 }
                    ),
                ],
            ],
        },
        ValidSource {
            files: vec![
                vec![
                    ValidToken::Whitespace(ValidWhitespace { buf: "\r".to_owned() }),
                    ValidToken::Number(
                        ValidNumber { kind: ValidNumberKind::DecimalInt, buf: "1".to_owned(), prefix_len: 0 }
                    ),
                ],
            ],
        },
        ValidSource {
            files: vec![
                vec![
                    ValidToken::Whitespace(ValidWhitespace { buf: " ".to_owned() }),
                    ValidToken::Number(
                        ValidNumber { kind: ValidNumberKind::DecimalInt, buf: "0".to_owned(), prefix_len: 0 }
                    ),
                ],
            ],
        },
        ValidSource {
            files: vec![
                vec![
                    ValidToken::Whitespace(ValidWhitespace { buf: " ".to_owned() }),
                    ValidToken::Number(
                        ValidNumber { kind: ValidNumberKind::DecimalInt, buf: "1".to_owned(), prefix_len: 0 }
                    ),
                ],
            ],
        },
        // Results of shrinking the second pair of lexemes
        ValidSource {
            files: vec![
                vec![
                    ValidToken::LineComment(ValidLineComment { buf: "#\n".to_owned() }),
                    ValidToken::Word(ValidWord { buf: "XZ".to_owned() }),
                ],
            ],
        },
        ValidSource {
            files: vec![
                vec![
                    ValidToken::LineComment(ValidLineComment { buf: "#\n".to_owned() }),
                    ValidToken::Word(ValidWord { buf: "XY".to_owned() }),
                ],
            ],
        },
    ];
    let actual: Vec<_> = source.shrink().collect();

    assert_eq!(actual, expected);
}
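
// A hand-written sanity check for ValidSource::render() itself, complementing the
// `valid_source` QuickCheck property at the bottom of this file. The concrete slices
// and locations asserted here follow from the rendering rules above (one column per
// grapheme cluster; a newline bumps the line and resets the column), so if those
// rules change, this test has to change with them.
#[test]
fn render_valid_source_slices_and_locations() {
    let source = ValidSource {
        files: vec![
            vec![
                ValidToken::Whitespace(ValidWhitespace { buf: "\n".to_owned() }),
                ValidToken::Word(ValidWord { buf: "foo".to_owned() }),
            ],
        ]
    };
    let (strings, tokens) = source.render();

    assert_eq!(strings, ["\nfoo"]);
    assert_eq!(tokens.len(), 2);

    // The whitespace token occupies the first byte and advances to line 2
    assert_eq!(tokens[0].slice, 0..1);
    assert_eq!(tokens[0].kind, lexer::TokenKind::Whitespace);
    assert_eq!(tokens[0].range, lexer::Range {
        start: lexer::Location { src_idx: 0, line: 1, column: 1 },
        end: lexer::Location { src_idx: 0, line: 2, column: 1 },
    });

    // The word token covers the remaining three bytes on line 2
    assert_eq!(tokens[1].slice, 1..4);
    assert_eq!(tokens[1].kind, lexer::TokenKind::Word);
    assert_eq!(tokens[1].range, lexer::Range {
        start: lexer::Location { src_idx: 0, line: 2, column: 1 },
        end: lexer::Location { src_idx: 0, line: 2, column: 4 },
    });
}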
#[test]
fn no_sources() {
    let sources: &[&str] = &[];
    assert!(lexer::lex(sources).unwrap().is_empty());
}

#[test]
fn single_empty_source() {
    assert!(lexer::lex(&[""]).unwrap().is_empty());
}

#[test]
fn multiple_empty_sources() {
    assert!(lexer::lex(&["", ""]).unwrap().is_empty());
}

#[test]
fn ascii_digit_is_numeric() {
    let lexemes: &[&str] = &[
        "0b0", "0b1", "0B0", "0B1", "0b00", "0b01", "0B00", "0B11",
        "0o0", "0o7", "0O0", "0O6", "0o00", "0o45", "0O32", "0O14",
        "0x0", "0xf", "0X0", "0XD", "0x00", "0x0A", "0X00", "0X0b", "0xC", "0Xf",
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "010", "0099", "2468", "13570", "34021",
        "0.0", "00.0", "0.00", "00.00", "1.0", "0.1", "123.468", "10.09",
        "0.0e0", "1.0e0", "0.1e0", "9.9e9", "1.9e5", "00.00e00", "123.456e012",
        "0.0E0", "2.0E0", "0.3E0", "4.5E9", "6.7E5", "00.00E00", "123.456E012",
        "0.0e+0", "6.0e+0", "0.4e+0", "3.6e+9", "1.3e+5", "00.00e+00", "123.456e+012",
        "0.0E+0", "7.0E+0", "0.3E+0", "4.5E+9", "3.5E+5", "00.00E+00", "456.123E+024",
        "0.0e-0", "8.0e-0", "0.2e-0", "5.4e-9", "5.7e-5", "00.00e-00", "246.753e-034",
        "0.0E-0", "9.0E-0", "0.1E-0", "6.3E-9", "7.9E-5", "00.00E-00", "846.195E-029",
        "53.46e24", "39.81E31", "65.43e+21", "75.64E+13", "13.37e-37", "42.24E-31",
    ];

    for &lexeme in lexemes {
        let range = lexer::Range {
            start: lexer::Location { src_idx: 0, line: 1, column: 1 },
            end: lexer::Location { src_idx: 0, line: 1, column: 1 + grapheme_count(lexeme) },
        };
        let sources = &[lexeme];
        let tokens = lexer::lex(sources).unwrap();
        let token = tokens[0];

        assert_eq!(tokens.len(), 1);
        assert_eq!(token.kind, lexer::TokenKind::Numeric);
        assert_eq!(token.value, lexeme);
        assert_eq!(token.range, range);
    }
}

#[test]
fn unicode_digit_is_not_numeric() {
    // Characters that are classified as "digit" in Unicode,
    // but should not be accepted by the lexer as such.
    let non_numeric_digits: &[&str] = &[
        "\u{0660}",
        "\u{0967}",
        "\u{0E52}",
        "\u{0967}",
        "\u{1814}",
        "\u{FF15}",
    ];

    // They each consist of a single grapheme cluster.
    let err_range = lexer::Range {
        start: lexer::Location { src_idx: 0, line: 1, column: 1 },
        end: lexer::Location { src_idx: 0, line: 1, column: 2 },
    };

    for &lexeme in non_numeric_digits {
        let sources = &[lexeme];
        let result = lexer::lex(sources);

        match result {
            Err(Error::Syntax { ref message, range }) => {
                assert_eq!(message, "Invalid token");
                assert_eq!(range, err_range);
            },
            Err(other) => panic!("Lexer returned a non-syntax error: {}", other),
            Ok(tokens) => panic!("Lexer unexpectedly accepted {}: {:#?}", lexeme, tokens),
        }
    }
}

#[test]
fn identifier_with_inner_unicode_digit() {
    let identifiers: &[&str] = &[
        "x\u{0660}",                 // Latin Letter followed by Digit
        "_\u{0967}",                 // Underscore followed by Digit
        "E\u{0300}\u{0e52}",         // Latin letter + Combining Grave Accent + Digit
        "\u{0687}\u{08EA}\u{0967}",  // Arabic + Nonspacing Mark followed by Digit
        "\u{4FA1}\u{1814}",          // Chinese followed by Digit
        "\u{0938}\u{A8F1}\u{FF15}",  // Devanagari + Combining Avagraha + Digit
    ];

    for &ident in identifiers {
        let range = lexer::Range {
            start: lexer::Location { src_idx: 0, line: 1, column: 1 },
            end: lexer::Location { src_idx: 0, line: 1, column: 1 + grapheme_count(ident) },
        };
        let sources = &[ident];
        let tokens = lexer::lex(sources).unwrap();
        let token = tokens[0];

        assert_eq!(tokens.len(), 1);
        assert_eq!(token.kind, lexer::TokenKind::Word);
        assert_eq!(token.value, ident);
        assert_eq!(token.range, range);
    }
}

#[test]
fn all_valid_punctuation() {
    for &punct in PUNCTUATION {
        let range = lexer::Range {
            start: lexer::Location { src_idx: 0, line: 1, column: 1 },
            end: lexer::Location { src_idx: 0, line: 1, column: 1 + grapheme_count(punct) },
        };
        let sources = &[punct];
        let tokens = lexer::lex(sources).unwrap();
        let token = tokens[0];

        assert_eq!(tokens.len(), 1);
        assert_eq!(token.kind, lexer::TokenKind::Punctuation);
        assert_eq!(token.value, punct);
        assert_eq!(token.range, range);
    }
}

#[test]
fn windows_newline_in_comment() {
    let src = ValidSource {
        files: vec![
            vec![
                ValidToken::LineComment(
                    ValidLineComment { buf: "#abc\r\n".to_owned() }
                ),
            ]
        ]
    };
    let (srcs, _) = src.render();
    let actual = lexer::lex(&srcs).unwrap();

    assert_eq!(actual.len(), 1);
    assert_eq!(actual[0].kind, lexer::TokenKind::Comment);
}
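
// Like windows_newline_in_comment above, but for a pure whitespace lexeme:
// CR LF is a single extended grapheme cluster, so, assuming the lexer counts
// newlines per grapheme (as ValidWhitespace::render() does), it should advance
// the end location by exactly one line.
#[test]
fn windows_newline_in_whitespace() {
    let sources = &["\r\n"];
    let tokens = lexer::lex(sources).unwrap();

    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0].kind, lexer::TokenKind::Whitespace);
    assert_eq!(tokens[0].value, "\r\n");
    assert_eq!(tokens[0].range, lexer::Range {
        start: lexer::Location { src_idx: 0, line: 1, column: 1 },
        end: lexer::Location { src_idx: 0, line: 2, column: 1 },
    });
}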
#[test]
fn interesting_valid_sources() {
    let test_cases: &[_] = &[
        // Some identifiers and keywords
        ("foo", vec!["foo"]),
        ("bar qux", vec!["bar", " ", "qux"]),
        ("if", vec!["if"]),
        ("then", vec!["then"]),
        ("else", vec!["else"]),
        ("match", vec!["match"]),
        ("true", vec!["true"]),
        ("false", vec!["false"]),
        ("as", vec!["as"]),
        ("nil", vec!["nil"]),
        ("struct", vec!["struct"]),
        ("class", vec!["class"]),
        ("enum", vec!["enum"]),
        ("fn", vec!["fn"]),
        ("impl", vec!["impl"]),
        ("let", vec!["let"]),
        // line comments without a trailing newline
        ("#", vec!["#"]),
        ("# baz", vec!["# baz"]),
        ("#\t", vec!["#\t"]),
        // without a decimal point, it's not a floating-point literal
        ("123e456", vec!["123", "e456"]),
        // .<ident>() can be a method call;
        // .digits can be a tuple field access
        ("850.method()", vec!["850", ".", "method", "(", ")"]),
        ("thing.01", vec!["thing", ".", "01"]),
        // numbers followed directly by a letter are allowed _for now_. Soon they won't be.
        ("99foo", vec!["99", "foo"]),
        ("03.14barqux", vec!["03.14", "barqux"]),
        // Lonely 0[bBoOxX] prefixes are _currently_ recognized as a zero and a letter. Soon they won't be.
        ("0b", vec!["0", "b"]),
        ("0B", vec!["0", "B"]),
        ("0o", vec!["0", "o"]),
        ("0O", vec!["0", "O"]),
        ("0x", vec!["0", "x"]),
        ("0X", vec!["0", "X"]),
        // .. and ... after a number is a range, not a floating-point literal
        ("10..99", vec!["10", "..", "99"]),
        ("32...85", vec!["32", "...", "85"]),
        ("25.7..0.34", vec!["25.7", "..", "0.34"]),
        ("3.14...2.71", vec!["3.14", "...", "2.71"]),
        // Parentheses
        ("()[]{}", vec!["(", ")", "[", "]", "{", "}"]),
        ("{[(([{}]))]}", vec!["{", "[", "(", "(", "[", "{", "}", "]", ")", ")", "]", "}"]),
        // Operators that are repetitions of the same character
        ("......", vec!["...", "..."]),
        (".....", vec!["...", ".."]),
        ("....", vec!["...", "."]),
        ("::::", vec!["::", "::"]),
        (":::", vec!["::", ":"]),
        ("?:", vec!["?", ":"]),
        ("?::", vec!["?", "::"]),
        ("====", vec!["==", "=="]),
        ("===", vec!["==", "="]),
        (". .....", vec![".", " ", "...", ".."]),
        (".. ....", vec!["..", " ", "...", "."]),
        (". .. ...", vec![".", " ", "..", " ", "..."]),
        (".. .. ..", vec!["..", " ", "..", " ", ".."]),
        (".. . ...", vec!["..", " ", ".", " ", "..."]),
        (".. ...", vec!["..", " ", "..."]),
        ("... ..", vec!["...", " ", ".."]),
        (".. ..", vec!["..", " ", ".."]),
        (". ...", vec![".", " ", "..."]),
        (". . . .", vec![".", " ", ".", " ", ".", " ", "."]),
        (": :::", vec![":", " ", "::", ":"]),
        (": ::", vec![":", " ", "::"]),
        ("? :", vec!["?", " ", ":"]),
        ("? : :", vec!["?", " ", ":", " ", ":"]),
        ("=== =", vec!["==", "=", " ", "="]),
        ("= ==", vec!["=", " ", "=="]),
        // no decrement/increment, separate logical/bitwise operators,
        // nor shifts (left/logical right/arithmetic right)
        ("--", vec!["-", "-"]),
        ("++", vec!["+", "+"]),
        ("&&", vec!["&", "&"]),
        ("||", vec!["|", "|"]),
        ("<<>>>", vec!["<", "<", ">", ">", ">"]),
        // no compound assignment operators
        (
            "+=-=*=/=%=&=|=~=",
            vec!["+", "=", "-", "=", "*", "=", "/", "=", "%", "=", "&", "=", "|", "=", "~", "="]
        ),
        // Operators sharing common characters
        ("->-<->->=><=>=", vec!["->", "-", "<->", "->", "=>", "<=", ">="]),
        // more complex/realistic sequences of tokens
        ("abc()", vec!["abc", "(", ")"]),
        ("def(true, 83)", vec!["def", "(", "true", ",", " ", "83", ")"]),
        ("qux.lol[index]", vec!["qux", ".", "lol", "[", "index", "]"]),
        ("let x_0 = \"0\";", vec!["let", " ", "x_0", " ", "=", " ", r#""0""#, ";"]),
        (
            "fn recent_followers(start_date: Date) -> [User] {\n    []\n}",
            vec![
                "fn", " ", "recent_followers", "(", "start_date", ":", " ", "Date", ")",
                " ", "->", " ", "[", "User", "]", " ", "{", "\n    ", "[", "]", "\n", "}",
            ]
        ),
        // some programmers just can't be bothered to write spaces between binary ops
        (
            "1.2+999*0.314e+1+2+2718.2818e-3-4",
            vec!["1.2", "+", "999", "*", "0.314e+1", "+", "2", "+", "2718.2818e-3", "-", "4"]
        ),
    ];

    for &(source, ref expected) in test_cases {
        let sources = &[source];
        let actual: Vec<_> = lexer::lex(sources)
            .unwrap()
            .iter()
            .map(|token| token.value)
            .collect();

        assert_eq!(actual, *expected);
    }
}

#[test]
fn invalid_source() {
    let lexemes: &[&str] = &[
        // Invalid punctuation
        "^", "!", "@", "'", "$", "\\", "`", "´", "–", "—", "…", "˚", "∞", "™",
        "¡", "¬", "§", "¶", "•", "≈", "˜", "√", "©", "˙", "¨",
        "“", "”", "‘", "’", "«", "»", "±",
        // Invalid string literals
        r#""abc def"#,       // Unterminated, no right "
        r#""\x0z""#,         // Invalid character in escape sequence #0
        r#""\u{abcdefg}""#,  // Invalid character in escape sequence #1
        r#""\q""#,           // Invalid character in escape sequence #2
        r#""\U{20}""#,       // Invalid character in escape sequence #3
        r#""\X0a""#,         // Invalid character in escape sequence #4
        r#""\x"#,            // Escape sequence too short #0
        r#""\x""#,           // Escape sequence too short #1
        r#""\x9""#,          // Escape sequence too short #2
        r#""\u""#,           // Missing delimiter in Unicode escape #0
        r#""\u{""#,          // Missing delimiter in Unicode escape #1
#1 r#""\u{}""#, // Escape sequence too short #3 r#"\"#, // Unterminated escape sequence // Tricky Unicode (some borrowed from QuickCheck) "\u{e0001}", // Tag "\u{e0020}", // Tag Space "\u{e000}", // Private Use, etc. "\u{e001}", "\u{ef8ff}", "\u{f0000}", "\u{ffffd}", "\u{ffffe}", "\u{fffff}", "\u{100000}", "\u{10fffd}", "\u{10fffe}", "\u{10ffff}", // Highest valid code point ]; for lexeme in lexemes { match lexer::lex(&[lexeme]) { Ok(tokens) => panic!("Invalid lexeme '{}' was unexpectedly recognized as {:?}", lexeme, tokens), Err(Error::Syntax { message, range }) => { let expected_range = lexer::Range { start: lexer::Location { src_idx: 0, line: 1, column: 1 }, end: lexer::Location { src_idx: 0, line: 1, column: 1 + grapheme_count(lexeme) }, }; assert_eq!(message, "Invalid token"); assert_eq!(range, expected_range); }, Err(err) => panic!("Lexer must always return a syntax error; got: {}", err), } } } #[cfg_attr(feature = "cargo-clippy", allow(needless_pass_by_value))] fn quickcheck_valid_lexeme(lexeme: T) -> bool { let mut source = String::new(); let start = lexer::Location { src_idx: 0, line: 1, column: 1 }; let mut end = start; let expected_kind = lexeme.render(&mut source, &mut end); let range = lexer::Range { start, end }; let sources = [source]; let tokens = lexer::lex(&sources).unwrap(); assert_eq!(tokens.len(), 1); assert_eq!(tokens[0].kind, expected_kind); assert_eq!(tokens[0].range, range); assert_eq!(tokens[0].value, sources[0]); true } quickcheck! { #[allow(trivial_casts)] fn random_sources(sources: Vec) -> bool { match lexer::lex(&sources) { Err(Error::Syntax { message, range }) => { assert_eq!(message, "Invalid token"); assert!(!sources[range.start.src_idx].is_empty()); true }, Err(_) => false, // lexer must always produce a syntax error on failure Ok(tokens) => { sources.iter().all(String::is_empty) && tokens.is_empty() || sources.iter().any(|s| !s.is_empty()) && !tokens.is_empty() }, } } // // Test each valid token kind before more complex tests // #[allow(trivial_casts)] fn valid_whitespace(lexeme: ValidWhitespace) -> bool { quickcheck_valid_lexeme(lexeme) } #[allow(trivial_casts)] fn valid_line_comment(lexeme: ValidLineComment) -> bool { quickcheck_valid_lexeme(lexeme) } #[allow(trivial_casts)] fn valid_word(lexeme: ValidWord) -> bool { quickcheck_valid_lexeme(lexeme) } #[allow(trivial_casts)] fn valid_number(lexeme: ValidNumber) -> bool { quickcheck_valid_lexeme(lexeme) } #[allow(trivial_casts)] fn valid_string(lexeme: ValidString) -> bool { quickcheck_valid_lexeme(lexeme) } #[allow(trivial_casts)] fn valid_punct(lexeme: ValidPunct) -> bool { quickcheck_valid_lexeme(lexeme) } #[allow(trivial_casts)] fn any_valid_token(token: ValidToken) -> bool { quickcheck_valid_lexeme(token) } #[allow(trivial_casts)] fn valid_source(source: ValidSource) -> bool { let (sources, expected) = source.render(); let actual = lexer::lex(&sources).unwrap(); assert_eq!(actual.len(), expected.len()); for (act_tok, exp_tok) in actual.iter().zip(expected) { let src_idx = exp_tok.range.start.src_idx; let slice = exp_tok.slice.clone(); assert_eq!(act_tok.kind, exp_tok.kind); assert_eq!(act_tok.value, &sources[src_idx][slice]); assert_eq!(act_tok.range, exp_tok.range); } true } }