// Copyright (c) 2016-2021 Fabian Schuiki

//! Lexer utilities.

use crate::errors::DiagResult;
use std::cmp::max;
use std::collections::VecDeque;
use std::io::Read;

/// A trait that can supply a peekable stream of characters.
pub trait Reader {
    fn peek(&mut self, offset: usize) -> Option<char>;
    fn consume(&mut self, amount: usize);
    fn clear(&mut self);
    fn to_string(&self) -> String;
}

/// A trait that can supply a stream of tokens.
pub trait Lexer<T> {
    fn next_token<'td>(&mut self) -> DiagResult<'td, T>;
}

/// A reader that accumulates the bytes it consumes from an underlying
/// `std::io::Read`, until they are explicitly cleared.
pub struct AccumulatingReader {
    rd: Box<dyn Read>,
    buf: Vec<u8>,
    base: usize,
    pos: usize,
    tail: usize,
}

impl AccumulatingReader {
    pub fn new(rd: Box<dyn Read>) -> AccumulatingReader {
        AccumulatingReader {
            rd,
            buf: Vec::new(),
            base: 0,
            pos: 0,
            tail: 0,
        }
    }

    /// Grow and fill the internal buffer such that at least `min_len` bytes
    /// are present, or the end of the file has been reached. This function
    /// may shift the buffer contents around, in which case previous buffer
    /// indices become invalid. Recalculate all indices derived from `base`,
    /// `pos`, or `tail` after a call to this function.
    pub fn refill(&mut self, mut min_len: usize) {
        // Move the buffer contents to the beginning to make more space at
        // the end.
        if self.base > 0 {
            for i in 0..(self.tail - self.base) {
                self.buf[i] = self.buf[self.base + i]
            }
            self.pos -= self.base;
            self.tail -= self.base;
            min_len -= self.base;
            self.base = 0;
        }
        assert!(self.base == 0);

        // Increase the buffer size to make sure the requested min_len can be
        // accommodated. Double the current size to amortize reallocations,
        // but never grow to less than min_len, which a plain doubling would
        // miss for large peek offsets.
        let cur_len = self.buf.len();
        if min_len > cur_len {
            let new_len = max(max(cur_len * 2, min_len), 32);
            self.buf.resize(new_len, 0);
        }
        assert!(self.buf.len() >= min_len);

        // Keep reading data until at least min_len bytes are in the buffer.
        // Note that this will usually fill in the entire buffer.
        while min_len > self.tail {
            let nread = self.rd.read(&mut self.buf[self.tail..]).unwrap();
            if nread == 0 {
                break;
            }
            self.tail += nread;
        }
    }

    /// Return a slice of the consumed bytes.
    pub fn slice(&self) -> &[u8] {
        &self.buf[self.base..self.pos]
    }

    /// Return a slice of the remaining bytes, starting at the last call to
    /// `clear()`.
    pub fn rem_slice(&self) -> &[u8] {
        &self.buf[self.base..]
    }
}

impl Reader for AccumulatingReader {
    /// Return the value of the byte that is `off` bytes away from the
    /// current position in the input file. If `off` lies beyond the end of
    /// the file, `None` is returned.
    fn peek(&mut self, off: usize) -> Option<char> {
        // If the requested offset lies outside the chunk of the file that is
        // currently in memory, refill the buffer first.
        let idx = self.pos + off;
        if idx >= self.tail {
            self.refill(idx + 1)
        }

        // The previous call to refill may move things around in the buffer,
        // so the index needs to be recalculated. If it lies beyond what we
        // were able to read from the file, as indicated by `self.tail`,
        // return `None` to indicate the end of the file.
        let idx = self.pos + off;
        if idx < self.tail {
            Some(self.buf[idx] as char)
        } else {
            None
        }
    }

    /// Consume the next `amt` bytes of the input. All consumed bytes since
    /// the last `clear()` can be accessed via `slice()` or `to_string()`.
    fn consume(&mut self, amt: usize) {
        self.pos += amt
    }

    /// Clear the consumed bytes.
    fn clear(&mut self) {
        self.base = self.pos
    }

    /// Convert the consumed bytes to a `String`.
    fn to_string(&self) -> String {
        // TODO: Don't just unwrap, but handle the case where the input file
        // is not valid UTF-8.
        String::from_utf8(self.slice().to_vec()).unwrap()
    }
}
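// The test below is a usage sketch added for illustration and is not part of
// the original file: it scans one word from a fixed byte stream, exercising
// `peek`, `consume`, `to_string`, and `clear`. `std::io::Cursor` stands in
// for an actual input file.
#[cfg(test)]
mod accumulating_reader_example {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn scan_one_word() {
        let mut rd = AccumulatingReader::new(Box::new(Cursor::new("foo bar")));
        // Look ahead until the first non-alphanumeric byte.
        let mut len = 0;
        while rd.peek(len).map_or(false, |c| c.is_ascii_alphanumeric()) {
            len += 1;
        }
        // Consume the word; it is now available as a string.
        rd.consume(len);
        assert_eq!(rd.to_string(), "foo");
        // Forget the consumed bytes; peeking resumes after them.
        rd.clear();
        assert_eq!(rd.peek(0), Some(' '));
    }
}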
/// A reader that supplies the characters of an in-memory string.
pub struct StringReader<'tdata> {
    iter: Box<dyn Iterator<Item = char> + 'tdata>,
    buf: Vec<char>,
    base: usize,
    pos: usize,
    tail: usize,
}

impl<'tdata> StringReader<'tdata> {
    pub fn new(data: &'tdata str) -> StringReader<'tdata> {
        StringReader {
            iter: Box::new(data.chars()),
            buf: Vec::new(),
            base: 0,
            pos: 0,
            tail: 0,
        }
    }

    /// Grow and fill the internal buffer such that at least `min_len`
    /// characters are present, or the end of the string has been reached.
    fn refill(&mut self, mut min_len: usize) {
        // Move the buffer contents to the beginning to make more space at
        // the end.
        if self.base > 0 {
            for i in 0..(self.tail - self.base) {
                self.buf[i] = self.buf[self.base + i]
            }
            self.pos -= self.base;
            self.tail -= self.base;
            min_len -= self.base;
            self.base = 0;
        }
        assert!(self.base == 0);

        // Increase the buffer size to make sure the requested min_len can be
        // accommodated. Double the current size to amortize reallocations,
        // but never grow to less than min_len.
        let cur_len = self.buf.len();
        if min_len > cur_len {
            let new_len = max(max(cur_len * 2, min_len), 32);
            self.buf.resize(new_len, '\0');
        }
        assert!(self.buf.len() >= min_len);

        // Keep reading characters until at least min_len of them are in the
        // buffer. Note that this will usually fill in the entire buffer.
        while min_len > self.tail {
            match self.iter.next() {
                Some(c) => {
                    self.buf[self.tail] = c;
                    self.tail += 1;
                }
                None => break,
            }
        }
    }
}

impl<'tdata> Reader for StringReader<'tdata> {
    fn peek(&mut self, offset: usize) -> Option<char> {
        // If the requested offset lies outside the chunk of characters that
        // has been parsed, refill the buffer first.
        let idx = self.pos + offset;
        if idx >= self.tail {
            self.refill(idx + 1)
        }

        // The previous call to refill may move things around in the buffer,
        // so the index needs to be recalculated. If it lies beyond what we
        // were able to parse from the string, as indicated by `self.tail`,
        // return `None` to indicate the end of the input.
        let idx = self.pos + offset;
        if idx < self.tail {
            Some(self.buf[idx])
        } else {
            None
        }
    }

    fn consume(&mut self, amount: usize) {
        self.pos += amount;
    }

    fn clear(&mut self) {
        self.base = self.pos;
    }

    fn to_string(&self) -> String {
        self.buf[self.base..self.pos].iter().collect()
    }
}

/// A lexer chaining the tokens of multiple other lexers, one after another.
/// Lexers can be pushed onto an internal stack and will then be queried for
/// tokens until their respective Eof has been reached. At that point, the
/// lexer is popped off the stack and the next lexer's tokens are produced.
/// An Eof is returned once all lexers have been drained.
pub struct StackedLexer<T> {
    stack: Vec<Box<dyn Lexer<T>>>,
    eof: T,
}

impl<T> StackedLexer<T> {
    pub fn new(eof: T) -> StackedLexer<T> {
        StackedLexer {
            stack: Vec::new(),
            eof,
        }
    }

    pub fn push(&mut self, lexer: Box<dyn Lexer<T>>) {
        self.stack.push(lexer);
    }
}

impl<T: Clone + Eq> Lexer<T> for StackedLexer<T> {
    fn next_token<'td>(&mut self) -> DiagResult<'td, T> {
        loop {
            let token = if let Some(lexer) = self.stack.last_mut() {
                lexer.next_token()?
            } else {
                return Ok(self.eof.clone());
            };
            if token == self.eof {
                self.stack.pop();
                continue;
            } else {
                return Ok(token);
            }
        }
    }
}
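// The test below is a usage sketch added for illustration and is not part of
// the original file. `ReplayLexer` is a hypothetical token source that
// replays a fixed list; token type `u32` with Eof marker `0` is an arbitrary
// choice. `.ok()` is used instead of `.unwrap()` so the example does not
// depend on the error side of `DiagResult`.
#[cfg(test)]
mod stacked_lexer_example {
    use super::*;

    /// Replays a fixed sequence of tokens, then yields the Eof marker `0`.
    struct ReplayLexer {
        tokens: Vec<u32>,
        index: usize,
    }

    impl Lexer<u32> for ReplayLexer {
        fn next_token<'td>(&mut self) -> DiagResult<'td, u32> {
            let token = self.tokens.get(self.index).cloned().unwrap_or(0);
            self.index += 1;
            Ok(token)
        }
    }

    #[test]
    fn chains_two_lexers() {
        let mut stacked = StackedLexer::new(0u32);
        // The lexer pushed last is drained first, like an include stack.
        stacked.push(Box::new(ReplayLexer { tokens: vec![1, 2], index: 0 }));
        stacked.push(Box::new(ReplayLexer { tokens: vec![3], index: 0 }));
        assert_eq!(stacked.next_token().ok(), Some(3));
        assert_eq!(stacked.next_token().ok(), Some(1));
        assert_eq!(stacked.next_token().ok(), Some(2));
        // Once every lexer is drained, the Eof marker is returned.
        assert_eq!(stacked.next_token().ok(), Some(0));
    }
}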
/// A buffered lexer that allows tokens to be peeked at before they are
/// actually consumed.
pub struct BufferedLexer<T> {
    inner: Box<dyn Lexer<T>>,
    eof: T,
    buffer: VecDeque<T>,
    done: bool,
}

impl<T: Clone + Eq> BufferedLexer<T> {
    /// Create a new buffered lexer.
    pub fn new(inner: Box<dyn Lexer<T>>, eof: T) -> BufferedLexer<T> {
        BufferedLexer {
            inner,
            eof,
            buffer: VecDeque::new(),
            done: false,
        }
    }

    /// Peek at a token not yet consumed. This function merely returns a
    /// reference to said token. Use `pop()` to advance the lexer.
    pub fn peek<'td>(&mut self, offset: usize) -> DiagResult<'td, &T> {
        // Fill the buffer up to the offset that should be peeked at. If an
        // Eof is encountered beforehand, set the `done` flag and abort.
        while !self.done && self.buffer.len() <= offset {
            let token = self.inner.next_token()?;
            if token == self.eof {
                self.done = true;
            }
            self.buffer.push_back(token);
        }

        // Return the token at the requested offset, or the Eof token if the
        // offset lies beyond the end of the stream.
        Ok(self.buffer.get(offset).unwrap_or(&self.eof))
    }

    /// Insert a token in front of the stream such that it becomes the next
    /// token to be returned from `peek(0)` or `pop()`.
    #[allow(unused_variables)]
    pub fn push(&mut self, token: T) {
        unimplemented!();
    }

    /// Consume and return the current token. This is the same token that
    /// would be returned by `peek(0)`.
    pub fn pop<'td>(&mut self) -> DiagResult<'td, T> {
        if self.buffer.is_empty() && !self.done {
            self.inner.next_token()
        } else {
            Ok(self
                .buffer
                .pop_front()
                .expect("pop() called on drained BufferedLexer"))
        }
    }

    pub fn inner(&self) -> &dyn Lexer<T> {
        &*self.inner
    }

    pub fn inner_mut(&mut self) -> &mut dyn Lexer<T> {
        &mut *self.inner
    }
}
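// The test below is a usage sketch added for illustration and is not part of
// the original file: it wraps a toy lexer (re-declared here so the module is
// self-contained) in a `BufferedLexer` and looks ahead two tokens before
// consuming any. As above, `.ok()` avoids depending on the error side of
// `DiagResult`.
#[cfg(test)]
mod buffered_lexer_example {
    use super::*;

    /// Replays a fixed sequence of tokens, then yields the Eof marker `0`.
    struct ReplayLexer {
        tokens: Vec<u32>,
        index: usize,
    }

    impl Lexer<u32> for ReplayLexer {
        fn next_token<'td>(&mut self) -> DiagResult<'td, u32> {
            let token = self.tokens.get(self.index).cloned().unwrap_or(0);
            self.index += 1;
            Ok(token)
        }
    }

    #[test]
    fn peeks_before_popping() {
        let inner = ReplayLexer { tokens: vec![7, 8], index: 0 };
        let mut lexer = BufferedLexer::new(Box::new(inner), 0u32);
        // Look ahead without consuming anything.
        assert_eq!(lexer.peek(0).ok(), Some(&7));
        assert_eq!(lexer.peek(1).ok(), Some(&8));
        // Peeking past the end of the stream yields the Eof marker.
        assert_eq!(lexer.peek(2).ok(), Some(&0));
        // Popping returns the same tokens, in order, followed by Eof.
        assert_eq!(lexer.pop().ok(), Some(7));
        assert_eq!(lexer.pop().ok(), Some(8));
        assert_eq!(lexer.pop().ok(), Some(0));
    }
}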