Make it possible to perform extra consistency checks #39

Merged (36 commits) on Mar 15, 2024

Commits
13c31f0
Make it possible to perform extra consistency checks
gwenn Jan 2, 2024
1e9cee9
Check INSERT values count matches columns count
gwenn Jan 6, 2024
afbdb5c
Check VIEW columns count matches SELECT columns count
gwenn Jan 6, 2024
3ce6982
Draft checks for CREATE TABLE
gwenn Jan 6, 2024
311c31c
Check CREATE VIEW with duplicate column
gwenn Jan 7, 2024
63488e9
Move checks into a dedicated module
gwenn Jan 7, 2024
d45d03e
Move formatting stuff into a dedicated module
gwenn Jan 7, 2024
7f31aac
Check CREATE TABLE STRICT with missing or unknown datatype
gwenn Jan 7, 2024
4717969
Fix error handling
gwenn Jan 7, 2024
b48049e
Continue on ParserError
gwenn Jan 7, 2024
79d2785
Partial fix column type with GENERATED ALWAYS
gwenn Jan 11, 2024
a2711cc
Fix strict type check
gwenn Jan 13, 2024
6fa081d
Check INSERT ... DEFAULT VALUES
gwenn Jan 13, 2024
3da50ce
Check compound SELECTs columns count
gwenn Jan 13, 2024
031f63f
Check UPDATE ... ORDER BY ... without LIMIT
gwenn Jan 13, 2024
c7dad2e
Check all VALUES have the same number of terms
gwenn Jan 13, 2024
32fe0ff
Check ALTER TABLE ... ADD COLUMN ... PRIMARY KEY
gwenn Jan 14, 2024
b717149
Check ALTER TABLE ... ADD COLUMN ... UNIQUE
gwenn Jan 14, 2024
cb60e9d
Factorize code
gwenn Jan 14, 2024
f841f9b
Introduce custom_err macro
gwenn Jan 14, 2024
3526cd8
Misc
gwenn Jan 27, 2024
1d8bc69
Check ALTER TABLE ... RENAME TO ...
gwenn Jan 27, 2024
526194a
Check DELETE FROM ... ORDER BY ... without LIMIT
gwenn Jan 27, 2024
c222bcc
Check SELECT ... FROM ... NATURAL JOIN ... ON ...
gwenn Jan 27, 2024
e01700a
Move check in constructor
gwenn Jan 28, 2024
1f973b6
Fix missing docs
gwenn Jan 28, 2024
ba318de
List extra checks
gwenn Feb 4, 2024
7235a27
Test mixed comments
gwenn Feb 25, 2024
8cb2e10
Impl Cmd::is_explain
gwenn Feb 25, 2024
caa629b
Check temporary table with qualified name
gwenn Feb 25, 2024
bcfea15
Check CREATE TABLE with only generated column(s)
gwenn Feb 25, 2024
76455f9
Merge remote-tracking branch 'origin/master' into extra_check
gwenn Feb 25, 2024
08f964c
Fix Clippy warning
gwenn Feb 25, 2024
e9e4aad
Fix rustdoc
gwenn Mar 3, 2024
99049bf
Update checks page
gwenn Mar 3, 2024
a083a16
Misc
gwenn Mar 15, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -38,7 +38,7 @@ TODO:
- [ ] [If a keyword in double quotes is used in a context where it cannot be resolved to an identifier but where a string literal is allowed, then the token is understood to be a string literal instead of an identifier.](https://sqlite.org/lang_keywords.html)
- [ ] Tests
- [ ] Do not panic while parsing
- [ ] CREATE VIRTUAL TABLE args
- [x] CREATE VIRTUAL TABLE args
- [ ] Zero copy (at least tokens)

### Unsupported by Rust
@@ -47,7 +47,7 @@ TODO:

## API change

* No `ParseAlloc`/`ParseFree` anymore

## Features not tested

71 changes: 71 additions & 0 deletions checks.md
@@ -0,0 +1,71 @@
# Extra consistency checks

- `ALTER TABLE ... RENAME TO ...` when the old and new table names are the same => `Stmt::check`
- `ALTER TABLE ... ADD COLUMN ...` with a new primary key / unique constraint => `Stmt::check`
- `CREATE TABLE ...`
  - with a duplicated column name => `ColumnDefinition::add_column`
  - with the STRICT option and invalid or missing column type(s) => `CreateTableBody::check`
  - WITHOUT ROWID and without a primary key => `CreateTableBody::check`
- `CREATE VIEW ... (...) ...`
  - when the view column count does not match the SELECT column count => `Stmt::check`
  - with duplicated columns (same name) => `Stmt::check`
- `DELETE FROM ... ORDER BY ...` with ORDER BY but without LIMIT => `Stmt::check`
- `INSERT INTO ... (...) ...` when the column count does not match the SELECT column / VALUES count => `Stmt::check`
- `INSERT INTO ... (...) DEFAULT VALUES` with both explicit columns and DEFAULT VALUES => `Stmt::check`
- `SELECT ... EXCEPT|INTERSECT|UNION SELECT ...` when the SELECTs do not all have the same number of result columns => `SelectBody::push`
- `NATURAL JOIN ...` with an ON or USING clause => `FromClause::push`
- `UPDATE ... ORDER BY ...` with ORDER BY but without LIMIT => `Stmt::check`
- `VALUES (...), (...), ...` when the VALUES rows do not all have the same number of terms => `OneSelect::push`
- `WITH ...` with a duplicated table name => `CommonTableExpr::add_cte`
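
Most of these checks surface as a `ParserError` while iterating over parsed commands. As a minimal sketch of triggering the first check in the list (assuming the `fallible-iterator` crate, as used by `examples/sql_cmds.rs`; the table name is illustrative):

```rust
use fallible_iterator::FallibleIterator;
use sqlite3_parser::lexer::sql::Parser;

fn main() {
    // Old and new table names are identical, which `Stmt::check` rejects.
    let mut parser = Parser::new(b"ALTER TABLE t RENAME TO t;");
    match parser.next() {
        Ok(Some(cmd)) => println!("parsed: {cmd}"),
        Ok(None) => println!("empty input"),
        Err(err) => eprintln!("rejected: {err}"),
    }
}
```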

## TODO

### `CREATE TABLE`
- [X] qualified temporary table (schema other than `temp`)
```sql
sqlite> ATTACH DATABASE ':memory:' AS mem;
sqlite> CREATE TEMPORARY TABLE mem.x AS SELECT 1;
Parse error: temporary table name must be unqualified
```
```sql
sqlite> CREATE TEMPORARY TABLE temp.x AS SELECT 1;
-- OK
```
- [X] must have at least one non-generated column
```sql
sqlite> CREATE TABLE test(data AS (1));
Parse error: must have at least one non-generated column
```
- [ ] column constraint(s) checks
- [ ] table constraint(s) checks

### `HAVING`
- [X] HAVING clause on a non-aggregate query (`GroupBy::having`): the grammar already prevents this case (it differs here from the official SQLite grammar).
```sql
sqlite> SELECT 1 as i HAVING i > 1;
Parse error: HAVING clause on a non-aggregate query
```
vs
```
[ERROR sqlite3Parser] near HAVING, "Token(None)": syntax error
Err: near HAVING, "None": syntax error at (1, 21) in SELECT 1 as i HAVING i > 1
```

### `SELECT ...`
- [ ] no duplicated column name in `selcollist`/`Select::columns`
```sql
sqlite> SELECT 1 as i, 2 as i;
-- no error (likewise for PostgreSQL)
```

### `SELECT ... ORDER BY ...`
- [ ] ORDER BY term does not match any column in the result set (`Select::order_by`)
```sql
sqlite> SELECT 1 as i ORDER BY j;
Parse error: no such column: j
SELECT 1 as i ORDER BY j;
^--- error here
```

### `WITH`
- [ ] no duplicated column name in `CommonTableExpr::IndexedColumn`
7 changes: 5 additions & 2 deletions examples/sql_cmds.rs
@@ -3,7 +3,7 @@ use std::env;
use std::fs::read;
use std::panic;

use sqlite3_parser::lexer::sql::Parser;
use sqlite3_parser::lexer::sql::{Error, Parser};

/// Parse specified files and print all commands.
fn main() {
@@ -19,7 +19,10 @@ fn main() {
Ok(None) => break,
Err(err) => {
eprintln!("Err: {err} in {arg}");
break;
if let Error::ParserError(..) = err {
} else {
break;
}
}
Ok(Some(cmd)) => {
println!("{cmd}");
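
With this change the example keeps going after a grammar or consistency-check error and only stops on lexer or I/O errors. Assuming a checkout of this repository, it can be exercised with something like `cargo run --example sql_cmds -- some_file.sql` (the file name is hypothetical).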
4 changes: 4 additions & 0 deletions src/dialect/mod.rs
@@ -15,9 +15,11 @@ pub(crate) fn sentinel(start: usize) -> Token {
}

impl Token {
/// Access token value
pub fn unwrap(self) -> String {
self.1.unwrap()
}
/// Take token value
pub fn take(&mut self) -> Self {
Token(self.0, self.1.take(), self.2)
}
@@ -64,6 +66,7 @@ fn from_bytes(bytes: &[u8]) -> String {
include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
pub(crate) const MAX_KEYWORD_LEN: usize = 17;

/// Check if `word` is a keyword
pub fn keyword_token(word: &[u8]) -> Option<TokenType> {
KEYWORDS
.get(UncasedStr::new(unsafe { str::from_utf8_unchecked(word) }))
@@ -239,6 +242,7 @@ pub(crate) fn from_token(ty: u16, value: Token) -> String {
}

impl TokenType {
/// Return the associated string (mainly for testing)
pub const fn as_str(&self) -> Option<&'static str> {
use TokenType::*;
match self {
2 changes: 1 addition & 1 deletion src/dialect/token.rs
@@ -5,7 +5,7 @@
// Renamed manually.
// To be keep in sync.
#[non_exhaustive]
#[allow(non_camel_case_types)]
#[allow(non_camel_case_types, missing_docs)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd)]
#[repr(u16)]
pub enum TokenType {
10 changes: 8 additions & 2 deletions src/lexer/scan.rs
@@ -6,7 +6,9 @@ use std::error::Error;
use std::fmt;
use std::io;

/// Error with position
pub trait ScanError: Error + From<io::Error> + Sized {
/// Update the position where the error occurs
fn position(&mut self, line: u64, column: usize);
}

@@ -17,8 +19,10 @@ type SplitResult<'input, TokenType, Error> =

/// Split function used to tokenize the input
pub trait Splitter: Sized {
/// Potential error raised
type Error: ScanError;
//type Item: ?Sized;
/// Token generated
type TokenType;

/// The arguments are an initial substring of the remaining unprocessed
@@ -55,6 +59,7 @@ pub struct Scanner<S: Splitter> {
}

impl<S: Splitter> Scanner<S> {
/// Constructor
pub fn new(splitter: S) -> Scanner<S> {
Scanner {
offset: 0,
@@ -74,14 +79,15 @@ impl<S: Splitter> Scanner<S> {
pub fn column(&self) -> usize {
self.column
}

/// Associated splitter
pub fn splitter(&self) -> &S {
&self.splitter
}

/// Mark current position
pub fn mark(&mut self) {
self.mark = (self.offset, self.line, self.column);
}
/// Reset to mark
pub fn reset_to_mark(&mut self) {
(self.offset, self.line, self.column) = self.mark;
}
14 changes: 13 additions & 1 deletion src/lexer/sql/error.rs
@@ -5,20 +5,31 @@ use std::io;
use crate::lexer::scan::ScanError;
use crate::parser::ParserError;

/// SQL lexer and parser errors
#[non_exhaustive]
#[derive(Debug)]
pub enum Error {
/// I/O Error
Io(io::Error),
/// Lexer error
UnrecognizedToken(Option<(u64, usize)>),
/// Missing quote or double-quote or backtick
UnterminatedLiteral(Option<(u64, usize)>),
/// Missing `]`
UnterminatedBracket(Option<(u64, usize)>),
/// Missing `*/`
UnterminatedBlockComment(Option<(u64, usize)>),
/// Invalid parameter name
BadVariableName(Option<(u64, usize)>),
/// Invalid number format
BadNumber(Option<(u64, usize)>),
/// Invalid or missing sign after `!`
ExpectedEqualsSign(Option<(u64, usize)>),
/// BLOB literals are string literals containing hexadecimal data and preceded by a single "x" or "X" character.
MalformedBlobLiteral(Option<(u64, usize)>),
/// Hexadecimal integer literals follow the C-language notation of "0x" or "0X" followed by hexadecimal digits.
MalformedHexInteger(Option<(u64, usize)>),
/// Grammar error
ParserError(ParserError, Option<(u64, usize)>),
}

@@ -45,7 +56,8 @@ impl fmt::Display for Error {
Error::MalformedHexInteger(pos) => {
write!(f, "malformed hex integer at {:?}", pos.unwrap())
}
Error::ParserError(ref msg, pos) => write!(f, "{} at {:?}", msg, pos.unwrap()),
Error::ParserError(ref msg, Some(pos)) => write!(f, "{} at {:?}", msg, pos),
Error::ParserError(ref msg, _) => write!(f, "{}", msg),
}
}
}
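
With this change a position is only printed when one is available. A small sketch of how a caller might recover the position from the variant (assuming the `fallible-iterator` crate; field layout as in the enum above):

```rust
use fallible_iterator::FallibleIterator;
use sqlite3_parser::lexer::sql::{Error, Parser};

fn report(sql: &[u8]) {
    let mut parser = Parser::new(sql);
    // `ParserError` carries an `Option<(line, column)>` position.
    if let Err(Error::ParserError(msg, Some((line, column)))) = parser.next() {
        eprintln!("{msg} at line {line}, column {column}");
    }
}

fn main() {
    report(b"UPDATE t SET x = 1 ORDER BY x;"); // ORDER BY without LIMIT
}
```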
24 changes: 18 additions & 6 deletions src/lexer/sql/mod.rs
@@ -24,13 +24,15 @@ pub use error::Error;
// TODO Extract scanning stuff and move this into the parser crate
// to make possible to use the tokenizer without depending on the parser...

/// SQL parser
pub struct Parser<'input> {
input: &'input [u8],
scanner: Scanner<Tokenizer>,
parser: yyParser<'input>,
}

impl<'input> Parser<'input> {
/// Constructor
pub fn new(input: &'input [u8]) -> Parser<'input> {
let lexer = Tokenizer::new();
let scanner = Scanner::new(lexer);
@@ -42,15 +44,16 @@ impl<'input> Parser<'input> {
parser,
}
}

/// Parse new `input`
pub fn reset(&mut self, input: &'input [u8]) {
self.input = input;
self.scanner.reset();
}

/// Current line position in input
pub fn line(&self) -> u64 {
self.scanner.line()
}
/// Current column position in input
pub fn column(&self) -> usize {
self.scanner.column()
}
@@ -231,30 +234,39 @@ impl<'input> FallibleIterator for Parser<'input> {
return Err(err);
}
let cmd = self.parser.ctx.cmd();
if let Some(ref cmd) = cmd {
if let Err(e) = cmd.check() {
let err = Error::ParserError(e, Some((self.scanner.line(), self.scanner.column())));
return Err(err);
}
}
Ok(cmd)
}
}
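
The `check` call above is what wires the extra consistency checks into iteration: every `Cmd` the parser yields has already passed them, and a failed check surfaces as `Error::ParserError` tagged with the current scanner position. A sketch of a parse loop over several statements (assuming the `fallible-iterator` crate; the SQL is illustrative):

```rust
use fallible_iterator::FallibleIterator;
use sqlite3_parser::lexer::sql::{Error, Parser};

fn main() {
    let sql = b"CREATE TABLE t(x); DELETE FROM t ORDER BY x;";
    let mut parser = Parser::new(sql);
    loop {
        match parser.next() {
            Ok(Some(cmd)) => println!("ok: {cmd}"),
            Ok(None) => break, // end of input
            // A failed check (here: DELETE with ORDER BY but no LIMIT)
            // is not fatal; keep parsing the remaining statements.
            Err(Error::ParserError(msg, pos)) => eprintln!("check failed: {msg} at {pos:?}"),
            Err(err) => {
                eprintln!("fatal: {err}");
                break;
            }
        }
    }
}
```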

/// SQL token
pub type Token<'input> = (&'input [u8], TokenType);

/// SQL lexer
#[derive(Default)]
pub struct Tokenizer {}

impl Tokenizer {
/// Constructor
pub fn new() -> Tokenizer {
Tokenizer {}
}
}

/// ```compile_fail
/// ```rust
/// use sqlite3_parser::lexer::sql::Tokenizer;
/// use sqlite3_parser::lexer::Scanner;
///
/// let tokenizer = Tokenizer::new();
/// let input = "PRAGMA parser_trace=ON;".as_bytes();
/// let mut s = Scanner::new(input, tokenizer);
/// let (token1, _) = s.scan().unwrap().unwrap();
/// s.scan().unwrap().unwrap();
/// let mut s = Scanner::new(tokenizer);
/// let Ok((_, Some((token1, _)), _)) = s.scan(input) else { panic!() };
/// s.scan(input).unwrap();
/// assert!(b"PRAGMA".eq_ignore_ascii_case(token1));
/// ```
impl Splitter for Tokenizer {