From d7b50fc52daedc9c8cfe9682f88f2d0b2fbc2d2f Mon Sep 17 00:00:00 2001 From: gwenn Date: Wed, 8 Mar 2023 20:08:20 +0100 Subject: [PATCH 1/3] Drop support of buf_redux / streaming input --- Cargo.toml | 3 +- examples/sql_check.rs | 8 +- examples/sql_cmds.rs | 8 +- examples/sql_tokens.rs | 9 +- src/lexer/mod.rs | 4 +- src/lexer/scan.rs | 208 +++------------------------ src/lexer/sql/mod.rs | 311 ++++++++++++++++++----------------------- src/parser/mod.rs | 2 +- 8 files changed, 166 insertions(+), 387 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 304124a..65bdc24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ YYNOERRORRECOVERY = [] YYSTACKDYNAMIC = [] YYCOVERAGE = [] NDEBUG = [] -default = ["YYNOERRORRECOVERY", "buf_redux"] +default = ["YYNOERRORRECOVERY"] [dependencies] phf = { version = "0.11", features = ["uncased"] } @@ -30,7 +30,6 @@ log = "0.4" memchr = "2.0" fallible-iterator = "0.2" smallvec = ">=1.6.1" -buf_redux = { version = "0.8", optional = true } bitflags = "1.3" uncased = "0.9" indexmap = "1.9" diff --git a/examples/sql_check.rs b/examples/sql_check.rs index c772733..1e7b05c 100644 --- a/examples/sql_check.rs +++ b/examples/sql_check.rs @@ -1,10 +1,9 @@ use fallible_iterator::FallibleIterator; use std::env; -use std::fs::File; +use std::fs::read; use std::panic; use sqlite3_parser::lexer::sql::Parser; -use sqlite3_parser::lexer::InputStream; /// Parse specified files and check all commands. 
fn main() { @@ -13,9 +12,8 @@ fn main() { for arg in args.skip(1) { println!("{arg}"); let result = panic::catch_unwind(|| { - let f = File::open(arg.clone()).unwrap(); - let input = InputStream::new(f); - let mut parser = Parser::new(input); + let input = read(arg.clone()).unwrap(); + let mut parser = Parser::new(&input); loop { match parser.next() { Ok(None) => break, diff --git a/examples/sql_cmds.rs b/examples/sql_cmds.rs index 826beb8..8ebdeb0 100644 --- a/examples/sql_cmds.rs +++ b/examples/sql_cmds.rs @@ -1,10 +1,9 @@ use fallible_iterator::FallibleIterator; use std::env; -use std::fs::File; +use std::fs::read; use std::panic; use sqlite3_parser::lexer::sql::Parser; -use sqlite3_parser::lexer::InputStream; /// Parse specified files and print all commands. fn main() { @@ -13,9 +12,8 @@ fn main() { for arg in args.skip(1) { println!("{arg}"); let result = panic::catch_unwind(|| { - let f = File::open(arg.clone()).unwrap(); - let input = InputStream::new(f); - let mut parser = Parser::new(input); + let input = read(arg.clone()).unwrap(); + let mut parser = Parser::new(input.as_ref()); loop { match parser.next() { Ok(None) => break, diff --git a/examples/sql_tokens.rs b/examples/sql_tokens.rs index efffb9e..c9834c7 100644 --- a/examples/sql_tokens.rs +++ b/examples/sql_tokens.rs @@ -1,8 +1,8 @@ use sqlite3_parser::lexer::sql::{TokenType, Tokenizer}; -use sqlite3_parser::lexer::{InputStream, Scanner}; +use sqlite3_parser::lexer::Scanner; use std::env; -use std::fs::File; +use std::fs::read; use std::i64; use std::str; @@ -11,10 +11,9 @@ fn main() { use TokenType::*; let args = env::args(); for arg in args.skip(1) { - let f = File::open(arg.clone()).unwrap(); - let input = InputStream::new(f); + let input = read(arg.clone()).unwrap(); let tokenizer = Tokenizer::new(); - let mut s = Scanner::new(input, tokenizer); + let mut s = Scanner::new(&input, tokenizer); loop { match s.scan() { Ok(None) => break, diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 
3e900cc..953ff76 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -3,6 +3,4 @@ mod scan; pub mod sql; -#[cfg(feature = "buf_redux")] -pub use scan::InputStream; -pub use scan::{Input, ScanError, Scanner, Splitter}; +pub use scan::{ScanError, Scanner, Splitter}; diff --git a/src/lexer/scan.rs b/src/lexer/scan.rs index 34648e9..985f8be 100644 --- a/src/lexer/scan.rs +++ b/src/lexer/scan.rs @@ -6,170 +6,6 @@ use std::error::Error; use std::fmt; use std::io; -#[cfg(feature = "buf_redux")] -use buf_redux::Buffer; -#[cfg(feature = "buf_redux")] -const MAX_CAPACITY: usize = 1024 * 1024 * 1024; - -pub trait Input: fmt::Debug { - fn fill_buf(&mut self) -> io::Result<()>; // -> io::Result<&[u8]>; - fn eof(&self) -> bool; //&mut self -> io::Result - fn consume(&mut self, amount: usize); // -> &[u8] - fn buffer(&self) -> &[u8]; - fn is_empty(&self) -> bool; - fn len(&self) -> usize; -} - -/// Memory input -impl Input for &[u8] { - #[inline] - fn fill_buf(&mut self) -> io::Result<()> { - Ok(()) - } - - #[inline] - fn eof(&self) -> bool { - true - } - - #[inline] - fn consume(&mut self, amt: usize) { - *self = &self[amt..]; - } - - #[inline] - fn buffer(&self) -> &[u8] { - self - } - - #[inline] - fn is_empty(&self) -> bool { - (*self).is_empty() - } - - #[inline] - fn len(&self) -> usize { - (*self).len() - } -} - -impl Input for Vec { - #[inline] - fn fill_buf(&mut self) -> io::Result<()> { - Ok(()) - } - - #[inline] - fn eof(&self) -> bool { - true - } - - #[inline] - fn consume(&mut self, amt: usize) { - self.drain(..amt); - } - - #[inline] - fn buffer(&self) -> &[u8] { - self - } - - #[inline] - fn is_empty(&self) -> bool { - self.is_empty() - } - - #[inline] - fn len(&self) -> usize { - self.len() - } -} - -/// Streaming input -#[cfg(feature = "buf_redux")] -pub struct InputStream { - /// The reader provided by the client. - inner: R, - /// Buffer used as argument to split. 
- buf: Buffer, - eof: bool, -} - -#[cfg(feature = "buf_redux")] -impl InputStream { - pub fn new(inner: R) -> Self { - Self::with_capacity(inner, 4096) - } - - fn with_capacity(inner: R, capacity: usize) -> Self { - let buf = Buffer::with_capacity_ringbuf(capacity); - InputStream { - inner, - buf, - eof: false, - } - } -} - -#[cfg(feature = "buf_redux")] -impl Input for InputStream { - fn fill_buf(&mut self) -> io::Result<()> { - debug!(target: "scanner", "fill_buf: {}", self.buf.capacity()); - // Is the buffer full? If so, resize. - if self.buf.free_space() == 0 { - let mut capacity = self.buf.capacity(); - if capacity * 2 < MAX_CAPACITY { - capacity *= 2; - self.buf.make_room(); - self.buf.reserve(capacity); - } else { - return Err(io::Error::from(io::ErrorKind::UnexpectedEof)); // FIXME - } - } else if self.buf.usable_space() == 0 { - self.buf.make_room(); - } - // Finally we can read some input. - let sz = self.buf.read_from(&mut self.inner)?; - self.eof = sz == 0; - Ok(()) - } - - #[inline] - fn eof(&self) -> bool { - self.eof - } - - #[inline] - fn consume(&mut self, amt: usize) { - self.buf.consume(amt); - } - - #[inline] - fn buffer(&self) -> &[u8] { - self.buf.buf() - } - - #[inline] - fn is_empty(&self) -> bool { - self.buf.is_empty() - } - - #[inline] - fn len(&self) -> usize { - self.buf.len() - } -} - -#[cfg(feature = "buf_redux")] -impl fmt::Debug for InputStream { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("InputStream") - .field("input", &self.buf) - .field("eof", &self.eof) - .finish() - } -} - pub trait ScanError: Error + From + Sized { fn position(&mut self, line: u64, column: usize); } @@ -186,19 +22,15 @@ pub trait Splitter: Sized { type TokenType; /// The arguments are an initial substring of the remaining unprocessed - /// data and a flag, `eof`, that reports whether the Reader has no more data - /// to give. + /// data. 
/// /// If the returned error is non-nil, scanning stops and the error /// is returned to the client. /// - /// The function is never called with an empty data slice unless at EOF. - /// If `eof` is true, however, data may be non-empty and, - /// as always, holds unprocessed text. + /// The function is never called with an empty data slice. fn split<'input>( &mut self, data: &'input [u8], - eof: bool, ) -> SplitResult<'input, Self::TokenType, Self::Error>; } @@ -209,9 +41,9 @@ pub trait Splitter: Sized { /// Scanning stops unrecoverably at EOF, the first I/O error, or a token too /// large to fit in the buffer. When a scan stops, the reader may have /// advanced arbitrarily far past the last token. -pub struct Scanner { +pub struct Scanner<'input, S: Splitter> { /// The reader provided by the client. - input: I, + input: &'input [u8], /// The function to tokenize the input. splitter: S, /// current line number @@ -220,8 +52,8 @@ pub struct Scanner { column: usize, } -impl Scanner { - pub fn new(input: I, splitter: S) -> Scanner { +impl<'input, S: Splitter> Scanner<'input, S> { + pub fn new(input: &'input [u8], splitter: S) -> Scanner<'input, S> { Scanner { input, splitter, @@ -245,7 +77,7 @@ impl Scanner { } /// Reset the scanner such that it behaves as if it had never been used. - pub fn reset(&mut self, input: I) { + pub fn reset(&mut self, input: &'input [u8]) { self.input = input; self.line = 1; self.column = 1; @@ -254,22 +86,19 @@ impl Scanner { type ScanResult<'input, TokenType, Error> = Result, Error>; -impl Scanner { +impl<'input, S: Splitter> Scanner<'input, S> { /// Advance the Scanner to next token. /// Return the token as a byte slice. /// Return `None` when the end of the input is reached. /// Return any error that occurs while reading the input. pub fn scan(&mut self) -> ScanResult<'_, S::TokenType, S::Error> { - use std::mem; debug!(target: "scanner", "scan(line: {}, column: {})", self.line, self.column); // Loop until we have a token. 
loop { - let eof = self.input.eof(); // See if we can get a token with what we already have. - if !self.input.is_empty() || eof { - // TODO: I don't know how to make the borrow checker happy! - let data = unsafe { mem::transmute(self.input.buffer()) }; - match self.splitter.split(data, eof) { + if !self.input.is_empty() { + let data = self.input; + match self.splitter.split(data) { Err(mut e) => { e.position(self.line, self.column); return Err(e); @@ -289,13 +118,8 @@ impl Scanner { } } // We cannot generate a token with what we are holding. - // If we've already hit EOF, we are done. - if eof { - // Shut it down. - return Ok(None); - } - // Must read more data. - self.input.fill_buf()?; + // we are done. + return Ok(None); } } @@ -303,7 +127,7 @@ impl Scanner { fn consume(&mut self, amt: usize) { debug!(target: "scanner", "consume({})", amt); debug_assert!(amt <= self.input.len()); - for byte in &self.input.buffer()[..amt] { + for byte in &self.input[..amt] { if *byte == b'\n' { self.line += 1; self.column = 1; @@ -311,11 +135,11 @@ impl Scanner { self.column += 1; } } - self.input.consume(amt); + self.input = &self.input[amt..]; } } -impl fmt::Debug for Scanner { +impl<'input, S: Splitter> fmt::Debug for Scanner<'input, S> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Scanner") .field("input", &self.input) diff --git a/src/lexer/sql/mod.rs b/src/lexer/sql/mod.rs index fcc69bc..90f5465 100644 --- a/src/lexer/sql/mod.rs +++ b/src/lexer/sql/mod.rs @@ -18,22 +18,22 @@ mod test; use crate::lexer::scan::ScanError; use crate::lexer::scan::Splitter; -use crate::lexer::{Input, Scanner}; +use crate::lexer::Scanner; pub use crate::parser::ParserError; pub use error::Error; // TODO Extract scanning stuff and move this into the parser crate // to make possible to use the tokenizer without depending on the parser... 
-pub struct Parser { - scanner: Scanner, +pub struct Parser<'input> { + scanner: Scanner<'input, Tokenizer>, parser: yyParser, buffer: Vec, lookahead: VecDeque<(TokenType, String)>, } -impl Parser { - pub fn new(input: I) -> Parser { +impl<'input> Parser<'input> { + pub fn new(input: &'input [u8]) -> Parser<'input> { let lexer = Tokenizer::new(); let scanner = Scanner::new(input, lexer); let ctx = Context::new(); @@ -48,7 +48,7 @@ impl Parser { } } - pub fn reset(&mut self, input: I) { + pub fn reset(&mut self, input: &'input [u8]) { self.scanner.reset(input); } @@ -158,7 +158,7 @@ macro_rules! try_with_position { }; } -impl FallibleIterator for Parser { +impl<'input> FallibleIterator for Parser<'input> { type Item = Cmd; type Error = Error; @@ -259,11 +259,7 @@ impl Splitter for Tokenizer { fn split<'input>( &mut self, data: &'input [u8], - eof: bool, ) -> Result<(Option>, usize), Error> { - if eof && data.is_empty() { - return Ok((None, 0)); - } if data[0].is_ascii_whitespace() { // eat as much space as possible return Ok(( @@ -274,35 +270,35 @@ impl Splitter for Tokenizer { }, )); } - match data[0] { + return match data[0] { b'-' => { if let Some(b) = data.get(1) { if *b == b'-' { // eat comment if let Some(i) = memchr(b'\n', data) { - return Ok((None, i + 1)); - } else if eof { - return Ok((None, data.len())); - } // else ask more data until '\n' + Ok((None, i + 1)) + } else { + Ok((None, data.len())) + } } else if *b == b'>' { if let Some(b) = data.get(2) { if *b == b'>' { return Ok((Some((&data[..3], TK_PTR)), 3)); } } - return Ok((Some((&data[..2], TK_PTR)), 2)); + Ok((Some((&data[..2], TK_PTR)), 2)) } else { - return Ok((Some((&data[..1], TK_MINUS)), 1)); + Ok((Some((&data[..1], TK_MINUS)), 1)) } - } else if eof { - return Ok((Some((&data[..1], TK_MINUS)), 1)); - } // else ask more data + } else { + Ok((Some((&data[..1], TK_MINUS)), 1)) + } } - b'(' => return Ok((Some((&data[..1], TK_LP)), 1)), - b')' => return Ok((Some((&data[..1], TK_RP)), 1)), - b';' => 
return Ok((Some((&data[..1], TK_SEMI)), 1)), - b'+' => return Ok((Some((&data[..1], TK_PLUS)), 1)), - b'*' => return Ok((Some((&data[..1], TK_STAR)), 1)), + b'(' => Ok((Some((&data[..1], TK_LP)), 1)), + b')' => Ok((Some((&data[..1], TK_RP)), 1)), + b';' => Ok((Some((&data[..1], TK_SEMI)), 1)), + b'+' => Ok((Some((&data[..1], TK_PLUS)), 1)), + b'*' => Ok((Some((&data[..1], TK_STAR)), 1)), b'/' => { if let Some(b) = data.get(1) { if *b == b'*' { @@ -317,111 +313,106 @@ impl Splitter for Tokenizer { pb = *b; } if let Some(i) = end { - return Ok((None, i + 1)); - } else if eof { - return Err(Error::UnterminatedBlockComment(None)); - } // else ask more data until '*/' + Ok((None, i + 1)) + } else { + Err(Error::UnterminatedBlockComment(None)) + } } else { - return Ok((Some((&data[..1], TK_SLASH)), 1)); + Ok((Some((&data[..1], TK_SLASH)), 1)) } - } else if eof { - return Ok((Some((&data[..1], TK_SLASH)), 1)); + } else { + Ok((Some((&data[..1], TK_SLASH)), 1)) } } - b'%' => return Ok((Some((&data[..1], TK_REM)), 1)), + b'%' => Ok((Some((&data[..1], TK_REM)), 1)), b'=' => { if let Some(b) = data.get(1) { - return Ok(if *b == b'=' { + Ok(if *b == b'=' { (Some((&data[..2], TK_EQ)), 2) } else { (Some((&data[..1], TK_EQ)), 1) - }); - } else if eof { - return Ok((Some((&data[..1], TK_EQ)), 1)); - } // else ask more data to fuse '==' or not + }) + } else { + Ok((Some((&data[..1], TK_EQ)), 1)) + } } b'<' => { if let Some(b) = data.get(1) { - return Ok(match *b { + Ok(match *b { b'=' => (Some((&data[..2], TK_LE)), 2), b'>' => (Some((&data[..2], TK_NE)), 2), b'<' => (Some((&data[..2], TK_LSHIFT)), 2), _ => (Some((&data[..1], TK_LT)), 1), - }); - } else if eof { - return Ok((Some((&data[..1], TK_LT)), 1)); - } // else ask more data + }) + } else { + Ok((Some((&data[..1], TK_LT)), 1)) + } } b'>' => { if let Some(b) = data.get(1) { - return Ok(match *b { + Ok(match *b { b'=' => (Some((&data[..2], TK_GE)), 2), b'>' => (Some((&data[..2], TK_RSHIFT)), 2), _ => (Some((&data[..1], TK_GT)), 
1), - }); - } else if eof { - return Ok((Some((&data[..1], TK_GT)), 1)); - } // else ask more data + }) + } else { + Ok((Some((&data[..1], TK_GT)), 1)) + } } b'!' => { if let Some(b) = data.get(1) { - return if *b == b'=' { + if *b == b'=' { Ok((Some((&data[..2], TK_NE)), 2)) } else { Err(Error::ExpectedEqualsSign(None)) - }; - } else if eof { - return Err(Error::ExpectedEqualsSign(None)); - } // else ask more data + } + } else { + Err(Error::ExpectedEqualsSign(None)) + } } b'|' => { if let Some(b) = data.get(1) { - return Ok(if *b == b'|' { + Ok(if *b == b'|' { (Some((&data[..2], TK_CONCAT)), 2) } else { (Some((&data[..1], TK_BITOR)), 1) - }); - } else if eof { - return Ok((Some((&data[..1], TK_BITOR)), 1)); - } // else ask more data + }) + } else { + Ok((Some((&data[..1], TK_BITOR)), 1)) + } } - b',' => return Ok((Some((&data[..1], TK_COMMA)), 1)), - b'&' => return Ok((Some((&data[..1], TK_BITAND)), 1)), - b'~' => return Ok((Some((&data[..1], TK_BITNOT)), 1)), - quote @ b'`' | quote @ b'\'' | quote @ b'"' => return literal(data, eof, quote), + b',' => Ok((Some((&data[..1], TK_COMMA)), 1)), + b'&' => Ok((Some((&data[..1], TK_BITAND)), 1)), + b'~' => Ok((Some((&data[..1], TK_BITNOT)), 1)), + quote @ b'`' | quote @ b'\'' | quote @ b'"' => literal(data, quote), b'.' => { if let Some(b) = data.get(1) { if b.is_ascii_digit() { - return fractional_part(data, eof, 0); - } else if eof { - return Ok((Some((&data[..1], TK_DOT)), 1)); + fractional_part(data, 0) + } else { + Ok((Some((&data[..1], TK_DOT)), 1)) } - } else if eof { - return Ok((Some((&data[..1], TK_DOT)), 1)); - } // else ask more data + } else { + Ok((Some((&data[..1], TK_DOT)), 1)) + } } - b'0'..=b'9' => return number(data, eof), + b'0'..=b'9' => number(data), b'[' => { if let Some(i) = memchr(b']', data) { // Keep original quotes / '[' ... 
’]' - return Ok((Some((&data[0..i + 1], TK_ID)), i + 1)); - } else if eof { - return Err(Error::UnterminatedBracket(None)); - } // else ask more data until ']' + Ok((Some((&data[0..i + 1], TK_ID)), i + 1)) + } else { + Err(Error::UnterminatedBracket(None)) + } } b'?' => { match data.iter().skip(1).position(|&b| !b.is_ascii_digit()) { Some(i) => { // do not include the '?' in the token - return Ok((Some((&data[1..=i], TK_VARIABLE)), i + 1)); - } - None if eof => { - return Ok((Some((&data[1..], TK_VARIABLE)), data.len())); + Ok((Some((&data[1..=i], TK_VARIABLE)), i + 1)) } - _ => { - // else ask more data - } - }; + None => Ok((Some((&data[1..], TK_VARIABLE)), data.len())), + } } b'$' | b'@' | b'#' | b':' => { match data @@ -429,41 +420,36 @@ impl Splitter for Tokenizer { .skip(1) .position(|&b| !is_identifier_continue(b)) { - Some(0) => return Err(Error::BadVariableName(None)), + Some(0) => Err(Error::BadVariableName(None)), Some(i) => { // '$' is included as part of the name - return Ok((Some((&data[..=i], TK_VARIABLE)), i + 1)); + Ok((Some((&data[..=i], TK_VARIABLE)), i + 1)) } - None if eof => { + None => { if data.len() == 1 { return Err(Error::BadVariableName(None)); } - return Ok((Some((data, TK_VARIABLE)), data.len())); + Ok((Some((data, TK_VARIABLE)), data.len())) } - _ => { - // else ask more data - } - }; + } } b if is_identifier_start(b) => { - return if b == b'x' || b == b'X' { + if b == b'x' || b == b'X' { if let Some(&b'\'') = data.get(1) { - blob_literal(data, eof) + blob_literal(data) } else { - Ok(self.identifierish(data, eof)) + Ok(self.identifierish(data)) } } else { - Ok(self.identifierish(data, eof)) + Ok(self.identifierish(data)) } } - _ => return Err(Error::UnrecognizedToken(None)), + _ => Err(Error::UnrecognizedToken(None)), }; - // Request more data. 
- Ok((None, 0)) } } -fn literal(data: &[u8], eof: bool, quote: u8) -> Result<(Option>, usize), Error> { +fn literal(data: &[u8], quote: u8) -> Result<(Option>, usize), Error> { debug_assert_eq!(data[0], quote); let tt = if quote == b'\'' { TK_STRING } else { TK_ID }; let mut pb = 0; @@ -482,24 +468,22 @@ fn literal(data: &[u8], eof: bool, quote: u8) -> Result<(Option>, usiz } pb = *b; } - if end.is_some() || (eof && pb == quote) { + if end.is_some() || pb == quote { let i = match end { Some(i) => i, _ => data.len(), }; // keep original quotes in the token - return Ok((Some((&data[0..i], tt)), i)); - } else if eof { - return Err(Error::UnterminatedLiteral(None)); + Ok((Some((&data[0..i], tt)), i)) + } else { + Err(Error::UnterminatedLiteral(None)) } - // else ask more data until closing quote - Ok((None, 0)) } -fn blob_literal(data: &[u8], eof: bool) -> Result<(Option>, usize), Error> { +fn blob_literal(data: &[u8]) -> Result<(Option>, usize), Error> { debug_assert!(data[0] == b'x' || data[0] == b'X'); debug_assert_eq!(data[1], b'\''); - if let Some((i, b)) = data + return if let Some((i, b)) = data .iter() .enumerate() .skip(2) @@ -508,53 +492,46 @@ fn blob_literal(data: &[u8], eof: bool) -> Result<(Option>, usize), Er if *b != b'\'' || i % 2 != 0 { return Err(Error::MalformedBlobLiteral(None)); } - return Ok((Some((&data[2..i], TK_BLOB)), i + 1)); - } else if eof { - return Err(Error::MalformedBlobLiteral(None)); - } - // else ask more data - Ok((None, 0)) + Ok((Some((&data[2..i], TK_BLOB)), i + 1)) + } else { + Err(Error::MalformedBlobLiteral(None)) + }; } -fn number(data: &[u8], eof: bool) -> Result<(Option>, usize), Error> { +fn number(data: &[u8]) -> Result<(Option>, usize), Error> { debug_assert!(data[0].is_ascii_digit()); if data[0] == b'0' { if let Some(b) = data.get(1) { if *b == b'x' || *b == b'X' { - return hex_integer(data, eof); + return hex_integer(data); } - } else if eof { - return Ok((Some((data, TK_INTEGER)), data.len())); } else { - // ask more 
data - return Ok((None, 0)); + return Ok((Some((data, TK_INTEGER)), data.len())); } } - if let Some((i, b)) = data + return if let Some((i, b)) = data .iter() .enumerate() .skip(1) .find(|&(_, &b)| !b.is_ascii_digit()) { if *b == b'.' { - return fractional_part(data, eof, i); + return fractional_part(data, i); } else if *b == b'e' || *b == b'E' { - return exponential_part(data, eof, i); + return exponential_part(data, i); } else if is_identifier_start(*b) { return Err(Error::BadNumber(None)); } - return Ok((Some((&data[..i], TK_INTEGER)), i)); - } else if eof { - return Ok((Some((data, TK_INTEGER)), data.len())); - } - // else ask more data - Ok((None, 0)) + Ok((Some((&data[..i], TK_INTEGER)), i)) + } else { + Ok((Some((data, TK_INTEGER)), data.len())) + }; } -fn hex_integer(data: &[u8], eof: bool) -> Result<(Option>, usize), Error> { +fn hex_integer(data: &[u8]) -> Result<(Option>, usize), Error> { debug_assert_eq!(data[0], b'0'); debug_assert!(data[1] == b'x' || data[1] == b'X'); - if let Some((i, b)) = data + return if let Some((i, b)) = data .iter() .enumerate() .skip(2) @@ -564,43 +541,39 @@ fn hex_integer(data: &[u8], eof: bool) -> Result<(Option>, usize), Err if i == 2 || is_identifier_start(*b) { return Err(Error::MalformedHexInteger(None)); } - return Ok((Some((&data[..i], TK_INTEGER)), i)); - } else if eof { + Ok((Some((&data[..i], TK_INTEGER)), i)) + } else { // Must not be empty (Ox is invalid) if data.len() == 2 { return Err(Error::MalformedHexInteger(None)); } - return Ok((Some((data, TK_INTEGER)), data.len())); - } - // else ask more data - Ok((None, 0)) + Ok((Some((data, TK_INTEGER)), data.len())) + }; } -fn fractional_part(data: &[u8], eof: bool, i: usize) -> Result<(Option>, usize), Error> { +fn fractional_part(data: &[u8], i: usize) -> Result<(Option>, usize), Error> { debug_assert_eq!(data[i], b'.'); - if let Some((i, b)) = data + return if let Some((i, b)) = data .iter() .enumerate() .skip(i + 1) .find(|&(_, &b)| !b.is_ascii_digit()) { if *b == 
b'e' || *b == b'E' { - return exponential_part(data, eof, i); + return exponential_part(data, i); } else if is_identifier_start(*b) { return Err(Error::BadNumber(None)); } - return Ok((Some((&data[..i], TK_FLOAT)), i)); - } else if eof { - return Ok((Some((data, TK_FLOAT)), data.len())); - } - // else ask more data - Ok((None, 0)) + Ok((Some((&data[..i], TK_FLOAT)), i)) + } else { + Ok((Some((data, TK_FLOAT)), data.len())) + }; } -fn exponential_part(data: &[u8], eof: bool, i: usize) -> Result<(Option>, usize), Error> { +fn exponential_part(data: &[u8], i: usize) -> Result<(Option>, usize), Error> { debug_assert!(data[i] == b'e' || data[i] == b'E'); // data[i] == 'e'|'E' - if let Some(b) = data.get(i + 1) { + return if let Some(b) = data.get(i + 1) { let i = if *b == b'+' || *b == b'-' { i + 1 } else { i }; if let Some((i, b)) = data .iter() @@ -611,47 +584,37 @@ fn exponential_part(data: &[u8], eof: bool, i: usize) -> Result<(Option( - &mut self, - data: &'input [u8], - eof: bool, - ) -> (Option>, usize) { + fn identifierish<'input>(&mut self, data: &'input [u8]) -> (Option>, usize) { debug_assert!(is_identifier_start(data[0])); // data[0] is_identifier_start => skip(1) let end = data .iter() .skip(1) .position(|&b| !is_identifier_continue(b)); - if end.is_some() || eof { - let i = match end { - Some(i) => i + 1, - _ => data.len(), - }; - let word = &data[..i]; - let tt = if word.len() >= 2 && word.len() <= MAX_KEYWORD_LEN && word.is_ascii() { - keyword_token(word).unwrap_or(TK_ID) - } else { - TK_ID - }; - return (Some((word, tt)), i); - } - // else ask more data - (None, 0) + let i = match end { + Some(i) => i + 1, + _ => data.len(), + }; + let word = &data[..i]; + let tt = if word.len() >= 2 && word.len() <= MAX_KEYWORD_LEN && word.is_ascii() { + keyword_token(word).unwrap_or(TK_ID) + } else { + TK_ID + }; + (Some((word, tt)), i) } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 2b541ff..88620ff 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs 
@@ -36,7 +36,7 @@ impl std::fmt::Display for ParserError { write!(f, "near {}, \"{:?}\": syntax error", token_type, found) } ParserError::UnexpectedEof => f.write_str("unexpected end of input"), - ParserError::Custom(s) => f.write_str(&s), + ParserError::Custom(s) => f.write_str(s), } } } From a74fdc957a96b17803d50fe306fcd2716468fcb0 Mon Sep 17 00:00:00 2001 From: gwenn Date: Thu, 9 Mar 2023 19:45:17 +0100 Subject: [PATCH 2/3] Keep input untouched, used an offset instead --- src/lexer/scan.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/lexer/scan.rs b/src/lexer/scan.rs index 985f8be..058a525 100644 --- a/src/lexer/scan.rs +++ b/src/lexer/scan.rs @@ -44,6 +44,8 @@ pub trait Splitter: Sized { pub struct Scanner<'input, S: Splitter> { /// The reader provided by the client. input: &'input [u8], + /// offset in `input` + offset: usize, /// The function to tokenize the input. splitter: S, /// current line number @@ -56,6 +58,7 @@ impl<'input, S: Splitter> Scanner<'input, S> { pub fn new(input: &'input [u8], splitter: S) -> Scanner<'input, S> { Scanner { input, + offset: 0, splitter, line: 1, column: 1, @@ -79,6 +82,7 @@ impl<'input, S: Splitter> Scanner<'input, S> { /// Reset the scanner such that it behaves as if it had never been used. pub fn reset(&mut self, input: &'input [u8]) { self.input = input; + self.offset = 0; self.line = 1; self.column = 1; } @@ -96,8 +100,8 @@ impl<'input, S: Splitter> Scanner<'input, S> { // Loop until we have a token. loop { // See if we can get a token with what we already have. - if !self.input.is_empty() { - let data = self.input; + if self.offset < self.input.len() { + let data = &self.input[self.offset..]; match self.splitter.split(data) { Err(mut e) => { e.position(self.line, self.column); @@ -126,8 +130,9 @@ impl<'input, S: Splitter> Scanner<'input, S> { /// Consume `amt` bytes of the buffer. 
fn consume(&mut self, amt: usize) { debug!(target: "scanner", "consume({})", amt); - debug_assert!(amt <= self.input.len()); - for byte in &self.input[..amt] { + let data = &self.input[self.offset..]; + debug_assert!(amt <= data.len()); + for byte in &data[..amt] { if *byte == b'\n' { self.line += 1; self.column = 1; @@ -135,7 +140,7 @@ impl<'input, S: Splitter> Scanner<'input, S> { self.column += 1; } } - self.input = &self.input[amt..]; + self.offset += amt; } } @@ -143,6 +148,7 @@ impl<'input, S: Splitter> fmt::Debug for Scanner<'input, S> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Scanner") .field("input", &self.input) + .field("offset", &self.offset) .field("line", &self.line) .field("column", &self.column) .finish() From 611b94f2fdcb5f7bd9c7fe960d33f167fb30c195 Mon Sep 17 00:00:00 2001 From: gwenn Date: Sat, 11 Mar 2023 18:02:27 +0100 Subject: [PATCH 3/3] Simplify lookhead code Fix #4 --- examples/sql_tokens.rs | 4 +- src/lexer/scan.rs | 46 +++++---- src/lexer/sql/mod.rs | 222 ++++++++++++++++++++--------------------- 3 files changed, 138 insertions(+), 134 deletions(-) diff --git a/examples/sql_tokens.rs b/examples/sql_tokens.rs index c9834c7..63420ce 100644 --- a/examples/sql_tokens.rs +++ b/examples/sql_tokens.rs @@ -13,9 +13,9 @@ fn main() { for arg in args.skip(1) { let input = read(arg.clone()).unwrap(); let tokenizer = Tokenizer::new(); - let mut s = Scanner::new(&input, tokenizer); + let mut s = Scanner::new(tokenizer); loop { - match s.scan() { + match s.scan(&input) { Ok(None) => break, Err(err) => { //eprintln!("{} at line: {}, column: {}", err, s.line(), s.column()); diff --git a/src/lexer/scan.rs b/src/lexer/scan.rs index 058a525..b8932e2 100644 --- a/src/lexer/scan.rs +++ b/src/lexer/scan.rs @@ -41,11 +41,11 @@ pub trait Splitter: Sized { /// Scanning stops unrecoverably at EOF, the first I/O error, or a token too /// large to fit in the buffer. 
When a scan stops, the reader may have /// advanced arbitrarily far past the last token. -pub struct Scanner<'input, S: Splitter> { - /// The reader provided by the client. - input: &'input [u8], +pub struct Scanner { /// offset in `input` offset: usize, + /// mark + mark: (usize, u64, usize), /// The function to tokenize the input. splitter: S, /// current line number @@ -54,11 +54,11 @@ pub struct Scanner<'input, S: Splitter> { column: usize, } -impl<'input, S: Splitter> Scanner<'input, S> { - pub fn new(input: &'input [u8], splitter: S) -> Scanner<'input, S> { +impl Scanner { + pub fn new(splitter: S) -> Scanner { Scanner { - input, offset: 0, + mark: (0, 0, 0), splitter, line: 1, column: 1, @@ -79,9 +79,15 @@ impl<'input, S: Splitter> Scanner<'input, S> { &self.splitter } + pub fn mark(&mut self) { + self.mark = (self.offset, self.line, self.column); + } + pub fn reset_to_mark(&mut self) { + (self.offset, self.line, self.column) = self.mark; + } + /// Reset the scanner such that it behaves as if it had never been used. - pub fn reset(&mut self, input: &'input [u8]) { - self.input = input; + pub fn reset(&mut self) { self.offset = 0; self.line = 1; self.column = 1; @@ -90,33 +96,36 @@ impl<'input, S: Splitter> Scanner<'input, S> { type ScanResult<'input, TokenType, Error> = Result, Error>; -impl<'input, S: Splitter> Scanner<'input, S> { +impl Scanner { /// Advance the Scanner to next token. /// Return the token as a byte slice. /// Return `None` when the end of the input is reached. /// Return any error that occurs while reading the input. - pub fn scan(&mut self) -> ScanResult<'_, S::TokenType, S::Error> { + pub fn scan<'input>( + &mut self, + input: &'input [u8], + ) -> ScanResult<'input, S::TokenType, S::Error> { debug!(target: "scanner", "scan(line: {}, column: {})", self.line, self.column); // Loop until we have a token. loop { // See if we can get a token with what we already have. 
- if self.offset < self.input.len() { - let data = &self.input[self.offset..]; + if self.offset < input.len() { + let data = &input[self.offset..]; match self.splitter.split(data) { Err(mut e) => { e.position(self.line, self.column); return Err(e); } Ok((None, 0)) => { - // Request more data + // Done } Ok((None, amt)) => { // Ignore/skip this data - self.consume(amt); + self.consume(data, amt); continue; } Ok((tok, amt)) => { - self.consume(amt); + self.consume(data, amt); return Ok(tok); } } @@ -128,9 +137,8 @@ impl<'input, S: Splitter> Scanner<'input, S> { } /// Consume `amt` bytes of the buffer. - fn consume(&mut self, amt: usize) { + fn consume(&mut self, data: &[u8], amt: usize) { debug!(target: "scanner", "consume({})", amt); - let data = &self.input[self.offset..]; debug_assert!(amt <= data.len()); for byte in &data[..amt] { if *byte == b'\n' { @@ -144,11 +152,11 @@ impl<'input, S: Splitter> Scanner<'input, S> { } } -impl<'input, S: Splitter> fmt::Debug for Scanner<'input, S> { +impl fmt::Debug for Scanner { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Scanner") - .field("input", &self.input) .field("offset", &self.offset) + .field("mark", &self.mark) .field("line", &self.line) .field("column", &self.column) .finish() diff --git a/src/lexer/sql/mod.rs b/src/lexer/sql/mod.rs index 90f5465..5b84ddd 100644 --- a/src/lexer/sql/mod.rs +++ b/src/lexer/sql/mod.rs @@ -1,13 +1,10 @@ //! 
Adaptation/port of [`SQLite` tokenizer](http://www.sqlite.org/src/artifact?ci=trunk&filename=src/tokenize.c) use fallible_iterator::FallibleIterator; use memchr::memchr; -use std::collections::VecDeque; pub use crate::dialect::TokenType; pub use crate::dialect::TokenType::*; -use crate::dialect::{ - from_bytes, is_identifier_continue, is_identifier_start, keyword_token, MAX_KEYWORD_LEN, -}; +use crate::dialect::{is_identifier_continue, is_identifier_start, keyword_token, MAX_KEYWORD_LEN}; use crate::parser::ast::Cmd; use crate::parser::parse::{yyParser, YYCODETYPE}; use crate::parser::Context; @@ -26,30 +23,27 @@ pub use error::Error; // to make possible to use the tokenizer without depending on the parser... pub struct Parser<'input> { - scanner: Scanner<'input, Tokenizer>, + input: &'input [u8], + scanner: Scanner, parser: yyParser, - buffer: Vec, - lookahead: VecDeque<(TokenType, String)>, } impl<'input> Parser<'input> { pub fn new(input: &'input [u8]) -> Parser<'input> { let lexer = Tokenizer::new(); - let scanner = Scanner::new(input, lexer); + let scanner = Scanner::new(lexer); let ctx = Context::new(); let parser = yyParser::new(ctx); - let buffer = Vec::new(); - let lookahead = VecDeque::new(); Parser { + input, scanner, parser, - buffer, - lookahead, } } pub fn reset(&mut self, input: &'input [u8]) { - self.scanner.reset(input); + self.input = input; + self.scanner.reset(); } pub fn line(&self) -> u64 { @@ -58,91 +52,99 @@ impl<'input> Parser<'input> { pub fn column(&self) -> usize { self.scanner.column() } +} - /* - ** Return the id of the next token in input. - */ - fn get_token(&mut self, i: usize) -> Result { - let mut t = if let Some((token_type, _)) = self.lookahead.get(i) { - *token_type - } else { - let (value, token_type) = match self.scanner.scan()? { - None => { - return Ok(TK_EOF); - } - Some(tuple) => tuple, - }; - self.lookahead.push_back((token_type, from_bytes(value))); - token_type +/* + ** Return the id of the next token in input. 
+ */ +fn get_token(scanner: &mut Scanner, input: &[u8]) -> Result { + let mut t = { + let (_, token_type) = match scanner.scan(input)? { + None => { + return Ok(TK_EOF); + } + Some(tuple) => tuple, }; - if t == TK_ID - || t == TK_STRING - || t == TK_JOIN_KW - || t == TK_WINDOW - || t == TK_OVER - || yyParser::parse_fallback(t as YYCODETYPE) == TK_ID as YYCODETYPE - { - t = TK_ID; - } - Ok(t) + token_type + }; + if t == TK_ID + || t == TK_STRING + || t == TK_JOIN_KW + || t == TK_WINDOW + || t == TK_OVER + || yyParser::parse_fallback(t as YYCODETYPE) == TK_ID as YYCODETYPE + { + t = TK_ID; } + Ok(t) +} - /* - ** The following three functions are called immediately after the tokenizer - ** reads the keywords WINDOW, OVER and FILTER, respectively, to determine - ** whether the token should be treated as a keyword or an SQL identifier. - ** This cannot be handled by the usual lemon %fallback method, due to - ** the ambiguity in some constructions. e.g. - ** - ** SELECT sum(x) OVER ... - ** - ** In the above, "OVER" might be a keyword, or it might be an alias for the - ** sum(x) expression. If a "%fallback ID OVER" directive were added to - ** grammar, then SQLite would always treat "OVER" as an alias, making it - ** impossible to call a window-function without a FILTER clause. - ** - ** WINDOW is treated as a keyword if: - ** - ** * the following token is an identifier, or a keyword that can fallback - ** to being an identifier, and - ** * the token after than one is TK_AS. - ** - ** OVER is a keyword if: - ** - ** * the previous token was TK_RP, and - ** * the next token is either TK_LP or an identifier. - ** - ** FILTER is a keyword if: - ** - ** * the previous token was TK_RP, and - ** * the next token is TK_LP. 
- */ - fn analyze_window_keyword(&mut self) -> Result { - let t = self.get_token(0)?; - if t != TK_ID { - return Ok(TK_ID); - }; - let t = self.get_token(1)?; - if t != TK_AS { - return Ok(TK_ID); - }; - Ok(TK_WINDOW) - } - fn analyze_over_keyword(&mut self, last_token: TokenType) -> Result { - if last_token == TK_RP { - let t = self.get_token(0)?; - if t == TK_LP || t == TK_ID { - return Ok(TK_OVER); - } +/* + ** The following three functions are called immediately after the tokenizer + ** reads the keywords WINDOW, OVER and FILTER, respectively, to determine + ** whether the token should be treated as a keyword or an SQL identifier. + ** This cannot be handled by the usual lemon %fallback method, due to + ** the ambiguity in some constructions. e.g. + ** + ** SELECT sum(x) OVER ... + ** + ** In the above, "OVER" might be a keyword, or it might be an alias for the + ** sum(x) expression. If a "%fallback ID OVER" directive were added to + ** grammar, then SQLite would always treat "OVER" as an alias, making it + ** impossible to call a window-function without a FILTER clause. + ** + ** WINDOW is treated as a keyword if: + ** + ** * the following token is an identifier, or a keyword that can fall back + ** to being an identifier, and + ** * the token after that one is TK_AS. + ** + ** OVER is a keyword if: + ** + ** * the previous token was TK_RP, and + ** * the next token is either TK_LP or an identifier. + ** + ** FILTER is a keyword if: + ** + ** * the previous token was TK_RP, and + ** * the next token is TK_LP. 
+ */ +fn analyze_window_keyword( + scanner: &mut Scanner, + input: &[u8], +) -> Result { + let t = get_token(scanner, input)?; + if t != TK_ID { + return Ok(TK_ID); + }; + let t = get_token(scanner, input)?; + if t != TK_AS { + return Ok(TK_ID); + }; + Ok(TK_WINDOW) +} +fn analyze_over_keyword( + scanner: &mut Scanner, + input: &[u8], + last_token: TokenType, +) -> Result { + if last_token == TK_RP { + let t = get_token(scanner, input)?; + if t == TK_LP || t == TK_ID { + return Ok(TK_OVER); } - Ok(TK_ID) } - fn analyze_filter_keyword(&mut self, last_token: TokenType) -> Result { - if last_token == TK_RP && self.get_token(1)? == TK_LP { - return Ok(TK_FILTER); - } - Ok(TK_ID) + Ok(TK_ID) +} +fn analyze_filter_keyword( + scanner: &mut Scanner, + input: &[u8], + last_token: TokenType, +) -> Result { + if last_token == TK_RP && get_token(scanner, input)? == TK_LP { + return Ok(TK_FILTER); } + Ok(TK_ID) } macro_rules! try_with_position { @@ -168,34 +170,29 @@ impl<'input> FallibleIterator for Parser<'input> { let mut last_token_parsed = TK_EOF; let mut eof = false; loop { - let lookahead = self.lookahead.pop_front(); - let (value, mut token_type) = if let Some((token_type, ref value)) = lookahead { - (value.as_bytes(), token_type) - } else { - match self.scanner.scan()? { - None => { - eof = true; - break; - } - Some(tuple) => tuple, + let (value, mut token_type) = match self.scanner.scan(self.input)? 
{ + None => { + eof = true; + break; } + Some(tuple) => tuple, }; let token = if token_type >= TK_WINDOW { debug_assert!( token_type == TK_OVER || token_type == TK_FILTER || token_type == TK_WINDOW ); - self.buffer.extend_from_slice(value); - + self.scanner.mark(); if token_type == TK_WINDOW { - token_type = self.analyze_window_keyword()?; + token_type = analyze_window_keyword(&mut self.scanner, self.input)?; } else if token_type == TK_OVER { - token_type = self.analyze_over_keyword(last_token_parsed)?; + token_type = + analyze_over_keyword(&mut self.scanner, self.input, last_token_parsed)?; } else if token_type == TK_FILTER { - token_type = self.analyze_filter_keyword(last_token_parsed)?; + token_type = + analyze_filter_keyword(&mut self.scanner, self.input, last_token_parsed)?; } - let token = token_type.to_token(self.buffer.as_slice()); - self.buffer.clear(); - token + self.scanner.reset_to_mark(); + token_type.to_token(value) } else { token_type.to_token(value) }; @@ -207,7 +204,6 @@ impl<'input> FallibleIterator for Parser<'input> { break; } } - self.lookahead.clear(); if last_token_parsed == TK_EOF { return Ok(None); // empty input } @@ -628,11 +624,11 @@ mod tests { fn fallible_iterator() { let tokenizer = Tokenizer::new(); let input = "PRAGMA parser_trace=ON;".as_bytes(); - let mut s = Scanner::new(input, tokenizer); - let (token1, token_type1) = s.scan().unwrap().unwrap(); + let mut s = Scanner::new(tokenizer); + let (token1, token_type1) = s.scan(input).unwrap().unwrap(); assert!(b"PRAGMA".eq_ignore_ascii_case(token1)); assert_eq!(TokenType::TK_PRAGMA, token_type1); - let (token2, token_type2) = s.scan().unwrap().unwrap(); + let (token2, token_type2) = s.scan(input).unwrap().unwrap(); assert_eq!("parser_trace".as_bytes(), token2); assert_eq!(TokenType::TK_ID, token_type2); }