//! Parsers for each part of TOML - keys, values, and arrays. //! //! Parser rules: //! 1. Each parser is only responsible for the length of the data it parses. Extraneous whitespace, //! comments, or invalid characters fall outside the scope of the parsers. //! 2. Parsers assume that the current index in the [`Text`] is the first character of what they //! should parse - ie, the first letter of a key, opening quote of a quoted key, opening bracket //! of a table, etc. //! 3. Each parser should leave `text.idx` at the last byte it parsed. use {crate::crate_prelude::*, std::num::IntErrorKind}; /// Parses a ` = ` assignment. pub fn parse_assignment<'a>(text: &mut Text<'a>) -> Result<(Key<'a>, TomlValue<'a>), Error> { let key = parse_key(text)?; text.idx += 1; text.skip_whitespace(); if text.current_byte() != Some(b'=') { return Err(Error { start: key.text.span().start, end: text.idx, kind: ErrorKind::NoEqualsInAssignment, }); } text.idx += 1; text.skip_whitespace(); if text.idx >= text.end() { return Err(Error { start: key.text.span().start, end: text.idx, kind: ErrorKind::NoValueInAssignment, }); } let value = parse_value(text)?; Ok((key, value)) } /// Parses a key. Supports quoted, dotted, and bare keys. pub fn parse_key<'a>(text: &mut Text<'a>) -> Result, Error> { let maybe_key = match text.current_byte().unwrap() { b'\'' | b'"' => parse_string(text)?, _ => { let start = text.idx; let mut current = text.idx; while let Some(byte) = text.byte(current) { if !byte.is_ascii_alphanumeric() && byte != b'-' && byte != b'_' { break; } current += 1; } if text.byte(current).is_none() { // Text shouldn't end on a key definition return Err(Error { start, end: current, kind: ErrorKind::NoValueInAssignment, }); } if start == current { // Empty bare keys are not allowed return Err(Error { start, end: current, kind: ErrorKind::InvalidBareKey, }); } let span = text.excerpt(start..current); text.idx = current - 1; CowSpan::Raw(span) } }; // Check for dotted key let key_end = text.idx; text.idx += 1; text.skip_whitespace(); if text.current_byte() == Some(b'.') { text.idx += 1; text.skip_whitespace(); Ok(Key { text: maybe_key, child: Some(Box::new(parse_key(text)?)), }) } else { text.idx = key_end; Ok(Key { text: maybe_key, child: None, }) } } /// Parses a value. Supports all of the non-time-related value types. pub fn parse_value<'a>(text: &mut Text<'a>) -> Result, Error> { match text.current_byte().unwrap() { // Integer, time, or float b'0'..=b'9' | b'i' | b'n' => parse_num(text, false), // Integer or float with +/- modifier b'+' if text.remaining_bytes() > 0 => { text.idx += 1; parse_num(text, false) } b'-' if text.remaining_bytes() > 0 => { text.idx += 1; parse_num(text, true) } // String b'\'' | b'"' => parse_string(text).map(TomlValue::String), // Bool b't' | b'f' if text.remaining_bytes() >= 3 => { let span = text.excerpt(text.idx..text.idx + 4); if span.as_str() == "true" { text.idx = span.end; return Ok(TomlValue::Boolean(true)); } else if span.as_str() == "fals" && text.byte(text.idx + 4) == Some(b'e') { text.idx = span.end + 1; return Ok(TomlValue::Boolean(false)); } let span = text.excerpt(text.idx..); Err(Error { start: span.start, end: span.find_next_whitespace_or_newline().unwrap_or(text.end()), kind: ErrorKind::UnrecognisedValue, }) } // Array b'[' => { if text.remaining_bytes() == 0 { return Err(Error { start: text.idx, end: text.idx, kind: ErrorKind::UnclosedBracket, }); } let mut array = Vec::new(); let mut span = text.excerpt(text.idx..); let mut seen_comma = true; text.idx += 1; loop { text.skip_whitespace_and_newlines(); match text.current_byte() { Some(b']') => break, Some(b',') => { text.idx += 1; text.skip_whitespace_and_newlines(); if text.remaining_bytes() == 0 { return Err(Error { start: span.start, end: text.idx, kind: ErrorKind::UnclosedBracket, }); } seen_comma = true; continue; } Some(b'#') => { text.idx = text.excerpt(text.idx..).find(b'\n').unwrap_or(text.end()); text.skip_whitespace_and_newlines(); continue; } Some(_) if !seen_comma => { return Err(Error { start: text.idx, end: text.idx, kind: ErrorKind::NoCommaDelimeter, }) } Some(_) => {} None => { return Err(Error { start: span.start, end: text.idx, kind: ErrorKind::UnclosedBracket, }) } } let value = parse_value(text)?; array.push(value); span.end = text.idx; text.idx += 1; seen_comma = false; } Ok(TomlValue::Array(array)) } // Inline table b'{' => { if text.remaining_bytes() == 0 { return Err(Error { start: text.idx, end: text.idx, kind: ErrorKind::UnclosedBracket, }); } let mut table = Table::default(); let mut span = text.excerpt(text.idx..); text.idx += 1; loop { text.skip_whitespace(); // Empty table if text.current_byte() == Some(b'}') { break; } let (key, value) = parse_assignment(text)?; let start = key.text.span().start; let end = key.text.span().end; let old_value = table.insert(key, value); if old_value { return Err(Error { start, end, kind: ErrorKind::ReusedKey, }); } span.end = text.idx; text.idx += 1; text.skip_whitespace(); match text.current_byte() { Some(b'}') => break, Some(b',') => {} Some(_) => { return Err(Error { start: text.idx, end: text.idx, kind: ErrorKind::NoCommaDelimeter, }) } None => { return Err(Error { start: span.start, end: span.end, kind: ErrorKind::UnclosedBracket, }) } } text.idx += 1; } Ok(TomlValue::Table(table)) } // ¯\_(ツ)_/¯ _ => { let span = text.excerpt(text.idx..); Err(Error { start: span.start, end: span.find_next_whitespace_or_newline().unwrap_or(text.end()), kind: ErrorKind::UnrecognisedValue, }) } } } fn parse_num<'a>(text: &mut Text<'a>, negative: bool) -> Result, Error> { let mut span = Span { start: text.idx, end: text.idx, source: text.text, }; // inf or nan let current_byte = text.current_byte().unwrap(); if (current_byte == b'i' || current_byte == b'n') && text.remaining_bytes() >= 2 { span.end += 2; if span.as_str() == "inf" { text.idx = span.end; if negative { return Ok(TomlValue::Float(-f64::INFINITY)); } else { return Ok(TomlValue::Float(f64::INFINITY)); } } else if span.as_str() == "nan" { text.idx = span.end; if negative { return Ok(TomlValue::Float(-f64::NAN)); } else { return Ok(TomlValue::Float(f64::NAN)); } } } let mut has_underscores = false; let mut is_float = false; let mut is_time = false; // Custom radix let radix = if current_byte == b'0' { match text.byte(span.end + 1) { Some(b'b') => { span.end += 1; while let Some(byte) = text.byte(span.end + 1) { if byte == b'0' || byte == b'1' { span.end += 1; } else if byte == b'_' { has_underscores = true; span.end += 1; } else { break; } } Some(2) } Some(b'o') => { span.end += 1; while let Some(byte) = text.byte(span.end + 1) { match byte { b'0'..=b'7' => span.end += 1, b'_' => { has_underscores = true; span.end += 1; } _ => break, } } Some(8) } Some(b'x') => { span.end += 1; while let Some(byte) = text.byte(span.end + 1) { match byte { b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => span.end += 1, b'_' => { has_underscores = true; span.end += 1; } _ => break, } } Some(16) } _ => None, } } else { None }; if radix.is_none() { let mut has_dash = false; while let Some(byte) = text.byte(span.end + 1) { match byte { b'0'..=b'9' => {} b'.' | b'e' | b'E' | b'+' => is_float = true, b':' => is_time = true, // Can be in floats (1e-4) and time (1974-12-03) b'-' => has_dash = true, b'_' => has_underscores = true, _ => break, } span.end += 1; } if is_float && is_time { return Err(Error { start: span.start, end: span.end, kind: ErrorKind::InvalidNumber, }); } else if !is_float && has_dash { is_time = true; } } if radix.is_some() { span.start += 2; } text.idx = span.end; let source = if has_underscores { let mut string = String::with_capacity(span.len()); for char_ in span.as_str().chars() { if char_ != '_' { string.push(char_); } } CowSpan::Modified(span, string) } else { CowSpan::Raw(span) }; let span = source.span(); if is_float { // Unfortunately, the f64 parser doesn't give detailed error information, so this is the best we can do. if let Ok(num) = source.as_str().parse::() { if negative { return Ok(TomlValue::Float(-num)); } else { return Ok(TomlValue::Float(num)); } } } if is_time && !negative { todo!("Time types") } match i64::from_str_radix(source.as_str(), radix.unwrap_or(10)) { Ok(num) => { if negative { return Ok(TomlValue::Integer(-num)); } else { return Ok(TomlValue::Integer(num)); } } Err(e) => match e.kind() { IntErrorKind::PosOverflow => { // i64::MIN, as a string, without the sign if negative && source.as_str() == "9223372036854775808" { return Ok(TomlValue::Integer(i64::MIN)); } return Err(Error { start: span.start, end: span.end, kind: ErrorKind::NumberTooLarge, }); } IntErrorKind::InvalidDigit => {} IntErrorKind::Empty => { return Err(Error { start: span.start, end: span.end, kind: ErrorKind::InvalidNumber, }) } _ => unreachable!(), }, } Err(Error { start: span.start, end: span.find_next_whitespace_or_newline().unwrap_or(text.end()), kind: ErrorKind::UnrecognisedValue, }) } /// Parses a string. Supports literal and basic strings. Handles basic string escapes /// automatically. pub fn parse_string<'a>(text: &mut Text<'a>) -> Result, Error> { let mut span = text.excerpt(text.idx..); match text.current_byte().unwrap() { b'\'' => { let (end, offset) = if text.remaining_bytes() > 5 && text.excerpt(text.idx..text.idx + 3).to_str() == "'''" { // Multi-line string span.start += 3; if text.byte(span.start).unwrap() == b'\n' { span.start += 1; } ( span.as_str().find("'''").map(|idx| { let mut idx = span.start + idx; while text.byte(idx) == Some(b'\'') { idx += 1; } idx - 3 }), 3, ) } else { // Single-line string span.start += 1; (span.find(b'\''), 1) }; let Some(end) = end else { return Err(Error { start: text.idx, end: span.find_next_whitespace_or_newline().unwrap_or(text.end()), kind: ErrorKind::UnclosedString, }); }; span.end = end - 1; text.idx = span.end + offset; Ok(CowSpan::Raw(span)) } b'"' => { let multiline = text.remaining_bytes() > 5 && text.excerpt(text.idx..text.idx + 3).to_str() == "\"\"\""; let offset = if multiline { 3 } else { 1 }; let start = span.start; let Some(end) = find_basic_string_end(&mut span, text, multiline) else { return Err(Error { start: text.idx, end: span.find_next_whitespace_or_newline().unwrap_or(text.end()), kind: ErrorKind::UnclosedString, }); }; span.start = start + offset; span.end = end - 1; if multiline && text.byte(span.start).unwrap() == b'\n' { span.start += 1; } text.idx = span.end + offset; if span.find(b'\\').is_some() { handle_basic_string_escapes(text, span) } else { Ok(CowSpan::Raw(span)) } } _ => unreachable!(), } } fn find_basic_string_end(span: &mut Span<'_>, text: &Text<'_>, multiline: bool) -> Option { let end = if multiline { // Multi-line string span.start += 3; span.as_str().find("\"\"\"").map(|idx| { let mut idx = span.start + idx; while text.byte(idx) == Some(b'"') { idx += 1; } idx - 3 }) } else { // Single-line string span.start += 1; span.find(b'"') }; if let Some(end) = end { if text.byte(end - 1).unwrap() == b'\\' && text.byte(end - 2).unwrap() != b'\\' { span.start = end; find_basic_string_end(span, text, multiline) } else { Some(end) } } else { None } } fn handle_basic_string_escapes<'a>(text: &Text<'a>, span: Span<'a>) -> Result, Error> { let mut string = String::with_capacity(span.len()); let mut chars = span.as_str().char_indices().peekable(); while let Some((idx, char)) = chars.next() { let idx = span.start + idx; if char == '\\' { let Some((idx, char)) = chars.next() else { return Err(Error { start: idx, end: idx, kind: ErrorKind::UnknownEscapeSequence, }); }; let idx = span.start + idx; let to_push = match char { 'b' => '\u{0008}', 't' => '\t', 'n' => '\n', 'f' => '\u{000C}', 'r' => '\r', '"' => '"', '\\' => '\\', 'u' => { if idx + 4 > text.end() { return Err(Error { start: idx, end: idx + 4, kind: ErrorKind::UnknownUnicodeScalar, }); } let source = text.excerpt(idx + 1..=idx + 4); let Some(char) = u32::from_str_radix(source.as_str(), 16) .ok() .and_then(char::from_u32) else { return Err(Error { start: idx, end: idx + 5, kind: ErrorKind::UnknownUnicodeScalar, }); }; chars.nth(3).unwrap(); char } 'U' => { if idx + 8 > text.end() { return Err(Error { start: idx, end: idx + 8, kind: ErrorKind::UnknownUnicodeScalar, }); } let source = text.excerpt(idx + 1..=idx + 8); let Some(char) = u32::from_str_radix(source.as_str(), 16) .ok() .and_then(char::from_u32) else { return Err(Error { start: idx, end: idx + 8, kind: ErrorKind::UnknownUnicodeScalar, }); }; chars.nth(7).unwrap(); char } ' ' | '\t' | '\n' | '\r' => { while let Some((_, char_)) = chars.peek() { let char_ = *char_; if char_ != ' ' && char_ != '\t' && char_ != '\n' && char_ != '\r' { break; } chars.next(); } continue; } _ => { return Err(Error { start: span.start + idx, end: span.start + idx + 1, kind: ErrorKind::UnknownEscapeSequence, }) } }; string.push(to_push); continue; } string.push(char); } Ok(CowSpan::Modified(span, string)) }