/* * Copyright 2022 Arnaud Golfouse * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ // Note that we could technically use this crate to parse this... but the errors would // be near impossible to debug :/ use ielr::{ input::{Grammar, Node, ProdIdx, Symbol, Token}, Algorithm, }; use std::{ collections::{hash_map::Entry, HashMap}, num::NonZeroU8, }; mod conflict; mod example; mod forbid; mod precedence; pub struct GrammarFormat { pub algorithm: Algorithm, pub grammar: Grammar, pub symbols: HashMap, pub examples: Vec>, pub examples_fail: Vec>, } #[derive(Clone, Copy)] enum Modifier { None, /// 0 or 1 Question, /// 0 or more Star, /// 1 or more Plus, } /// The grammar format is: /// - One production per line. /// - Empty lines are ignored. /// - Start with the left-hand side, then a ':', then the right-hand side. /// - Nodes are CamelCased, tokens are SCREAMING_CASE. /// - Symbols other than ':' and letters are individual tokens. /// - Multiple productions with the same lhs may be specified by starting a new line /// with `|`, then the rhs. /// /// # Special annotations /// /// ## Examples /// `@example(...)` will try to read the list of tokens in `...` with the generated /// parser. /// /// ## Conflicts /// A conflict resolution has the form `@conflict(c1 > c2)` or `@conflict(c1 < c2)`, /// where /// - `c1` and `c2` are either: /// - `reduce(Node, i)`: means 'reduction of the `i`-th production with lhs `Node` /// - `shift(Token)` /// /// ## Precedence /// TODO: handle multiple precedence families. /// /// A precedence annotation has the form `@precedence(Node, i, left = a, right = b)`, /// where /// - `Node, i` selects the `i`-th production with lhs `Node`. /// - `a` is the left precedence, `b` is the right precedence. Both `left = a` and /// `right = b` are optional. pub fn read_grammar(mut file: &str) -> Result { // skip leading whitespace and comments while file.starts_with("//") || file.starts_with('\n') { if let Some(end_line) = file.find('\n') { file = &file[end_line + 1..]; } else { file = "" } } let algorithm = if let Some(new_file) = file.strip_prefix("@Algorithm(") { let close_parent = new_file .find(')') .ok_or("no closing parenthesis for @Algorithm")?; let algo_type = &new_file[..close_parent]; file = &new_file[close_parent + 1..]; let comma = algo_type.find(',').ok_or("no comma for @Algorithm")?; let before_comma = algo_type[..comma].trim(); let after_comma = algo_type[comma + 1..].trim(); let k = after_comma .parse::() .map_err(|err| err.to_string())?; match before_comma { "lalr" => Algorithm::Lalr(k), "lr" => Algorithm::Lr(k), _ => return Err(format!("unknown algorithm '{before_comma}'")), } } else { Algorithm::default() }; let mut nodes: HashMap<&str, (Node, Vec)> = HashMap::new(); let mut tokens: HashMap<&str, Token> = HashMap::new(); let mut next_node = 0; let mut next_node = move || { next_node += 1; (Node(next_node - 1), Vec::new()) }; let mut next_token = 1; let mut next_token = move || { next_token += 1; Token::new(next_token - 1).unwrap() }; let mut grammar = Grammar::new(); let mut examples = Vec::new(); let mut examples_fail = Vec::new(); let mut last_lhs: Option<(&str, Node)> = None; let precedence_family = grammar.add_precedence_family(); for (line_number, mut line) in file.lines().enumerate() { let line_number = line_number + 1; line = line.trim(); if line.is_empty() || line.starts_with("//") { continue; } else if let Some(line) = line.strip_prefix("@example_fail") { let example = example::parse_example_fail(line, &tokens)?; examples_fail.push(example); continue; } else if let Some(line) = line.strip_prefix("@example") { let example = example::parse_example(line, &tokens)?; examples.push(example); continue; } else if let Some(line) = line.strip_prefix("@conflict") { let conflict_solution = conflict::parse_conflict(line, &nodes, &tokens)?; grammar.add_conflict_solution(conflict_solution); continue; } else if let Some(line) = line.strip_prefix("@forbid") { let (prod_idx, index, forbidden) = forbid::parse_forbid(line, &nodes)?; grammar .get_production_mut(prod_idx) .unwrap() .forbid_derivation(index, forbidden) .map_err(|_| "invalid forbid".to_string())?; continue; } else if let Some(line) = line.strip_prefix("@precedence") { let (prod, left, right) = precedence::parse_precedence(line, &nodes)?; let prod = grammar.get_production_mut(prod).unwrap(); if let Some(left) = left { prod.set_left_precedence(precedence_family, left); } if let Some(right) = right { prod.set_right_precedence(precedence_family, right); } continue; } let (lhs_word, lhs, words) = if let Some(rest) = line.strip_prefix('|') { let (word, lhs) = match last_lhs { Some(lhs) => lhs, None => return Err(format!("[line {line_number}] no lhs before '|'")), }; (word, lhs, rest.split_whitespace()) } else { match next_word(line) { None => continue, Some((word, rest)) => { let rest = match rest.strip_prefix(':') { Some(r) => r, None => { return Err(format!("[line {line_number}] Expected ':' after node")) } }; if !is_node(word) { return Err(format!( "[line {line_number}] word '{word}' is not a node !" )); } #[allow(clippy::needless_borrow)] let lhs = nodes.entry(word).or_insert_with(&mut next_node).0; (word, lhs, rest.split_whitespace()) } } }; last_lhs = Some((lhs_word, lhs)); let mut rhs = Vec::new(); for word in words { let (word, modified_word, modifier) = if word.len() == 1 { (word, word, Modifier::None) } else if let Some(modified) = word.strip_suffix('?') { (word, modified, Modifier::Question) } else if let Some(modified) = word.strip_suffix('*') { (word, modified, Modifier::Star) } else if let Some(modified) = word.strip_suffix('+') { (word, modified, Modifier::Plus) } else { (word, word, Modifier::None) }; if modified_word == "EOF" { return Err(String::from("EOF is a reserved identifier")); } #[allow(clippy::needless_borrow)] let symbol = if is_node(modified_word) { Symbol::Node(nodes.entry(modified_word).or_insert_with(&mut next_node).0) } else { Symbol::Token(*tokens.entry(modified_word).or_insert_with(&mut next_token)) }; let symbol = match modifier { Modifier::None => symbol, Modifier::Question => { let question = match nodes.entry(word) { Entry::Occupied(occupied) => occupied.get().0, Entry::Vacant(vacant) => { let (question, productions) = vacant.insert(next_node()); productions .push(grammar.add_production(*question, vec![symbol]).unwrap()); productions .push(grammar.add_production(*question, Vec::new()).unwrap()); *question } }; Symbol::Node(question) } Modifier::Star => { let star = match nodes.entry(word) { Entry::Occupied(occupied) => occupied.get().0, Entry::Vacant(vacant) => { let (star, productions) = vacant.insert(next_node()); productions.push( grammar .add_production(*star, vec![symbol, Symbol::Node(*star)]) .unwrap(), ); productions.push(grammar.add_production(*star, Vec::new()).unwrap()); *star } }; Symbol::Node(star) } Modifier::Plus => { let plus = match nodes.entry(word) { Entry::Occupied(occupied) => occupied.get().0, Entry::Vacant(vacant) => { let (plus, productions) = vacant.insert(next_node()); productions.push( grammar .add_production(*plus, vec![symbol, Symbol::Node(*plus)]) .unwrap(), ); productions.push(grammar.add_production(*plus, vec![symbol]).unwrap()); *plus } }; Symbol::Node(plus) } }; rhs.push(symbol); } nodes .get_mut(lhs_word) .unwrap() .1 .push(grammar.add_production(lhs, rhs).unwrap()); } let mut symbol_to_str = HashMap::new(); for (s, (node, _)) in nodes { symbol_to_str.insert(Symbol::Node(node), s.to_string()); } for (s, token) in tokens { symbol_to_str.insert(Symbol::Token(token), s.to_string()); } Ok(GrammarFormat { algorithm, grammar, symbols: symbol_to_str, examples, examples_fail, }) } fn next_word(mut input: &str) -> Option<(&str, &str)> { input = input.trim_start(); let mut chars_indices = input.char_indices(); let (_, next_char) = chars_indices.next()?; if next_char.is_alphabetic() { for (index, c) in chars_indices { if !(c.is_alphanumeric() || c == '_') { let to_return = &input[..index]; input = &input[index..]; return Some((to_return, input)); } } let to_return = input; Some((to_return, "")) } else { let to_return = match chars_indices.next() { Some((index, _)) => { let to_return = &input[..index]; input = &input[index..]; to_return } None => { let to_return = input; input = ""; to_return } }; Some((to_return, input)) } } fn is_node(word: &str) -> bool { let mut chars = word.chars(); let mut all_uppercase = true; match chars.next() { Some(c) if c.is_alphabetic() && c.is_uppercase() => {} _ => return false, } for ch in chars { if !(ch.is_alphanumeric() || ch == '_') { return false; } if ch.is_alphabetic() && ch.is_lowercase() { all_uppercase = false; } } !all_uppercase } fn parse_number(line: &str) -> Result<(&str, u64), String> { let mut num_index = 0; for c in line.chars() { if ('0'..='9').contains(&c) { num_index += 1; } else { break; } } let (number, line) = line.split_at(num_index); Ok((line, number.parse::().map_err(|err| err.to_string())?)) }