From 7407721f077a9952dbdd278e86e99bb81141c509 Mon Sep 17 00:00:00 2001 From: Eric Kidd Date: Fri, 13 Oct 2023 09:30:02 -0400 Subject: [PATCH] Prepare to write function tests - Keep track of which versions of functions we call. - Add comments for pending tests - Add a TODO list of functions to implement --- src/analyze.rs | 48 +++++++++++++++++---- src/cmd/sql_test.rs | 33 ++++++++------- tests/sql/data_types/literal_scalars.sql | 2 +- tests/sql/functions/README.md | 53 ++++++++++++++++++++++++ tests/sql/functions/simple/regexp.sql | 20 +++++++++ 5 files changed, 131 insertions(+), 25 deletions(-) create mode 100644 tests/sql/functions/README.md create mode 100644 tests/sql/functions/simple/regexp.sql diff --git a/src/analyze.rs b/src/analyze.rs index 2a42e9e..93b0620 100644 --- a/src/analyze.rs +++ b/src/analyze.rs @@ -5,11 +5,16 @@ use std::collections::HashMap; use derive_visitor::{Drive, Visitor}; -use crate::ast::{FunctionName, SpecialDateFunctionCall, SqlProgram}; +use crate::ast::{FunctionCall, SpecialDateFunctionCall, SqlProgram}; + +/// A `phf` set of functions that are known to take any number of arguments. +static KNOWN_VARARG_FUNCTIONS: phf::Set<&'static str> = phf::phf_set! { + "COALESCE", "CONCAT", +}; /// Count all the function calls in a [`SqlProgram`]. 
#[derive(Debug, Default, Visitor)] -#[visitor(FunctionName(enter), SpecialDateFunctionCall(enter))] +#[visitor(FunctionCall(enter), SpecialDateFunctionCall(enter))] pub struct FunctionCallCounts { counts: HashMap, } @@ -20,24 +25,51 @@ impl FunctionCallCounts { sql_program.drive(self) } - fn record_call(&mut self, name: &str) { - let count = self.counts.entry(name.to_ascii_uppercase()).or_default(); + fn record_call(&mut self, name: String) { + let count = self.counts.entry(name).or_default(); *count += 1; } - fn enter_function_name(&mut self, function_name: &FunctionName) { - self.record_call(&function_name.unescaped_bigquery()); + fn enter_function_call(&mut self, function_call: &FunctionCall) { + let base_name = function_call.name.unescaped_bigquery().to_ascii_uppercase(); + let mut name = format!("{}(", base_name); + if KNOWN_VARARG_FUNCTIONS.contains(base_name.as_str()) { + name.push('*'); + } else { + // Push '_' separated by ','. + for i in 0..function_call.args.nodes.len() { + if i > 0 { + name.push(','); + } + name.push('_'); + } + } + name.push(')'); + if function_call.over_clause.is_some() { + name.push_str(" OVER(..)"); + } + self.record_call(name); } fn enter_special_date_function_call( &mut self, special_date_function_call: &SpecialDateFunctionCall, ) { - self.record_call( + let mut name = format!( + "{}(", special_date_function_call .function_name - .unescaped_bigquery(), + .unescaped_bigquery() + .to_ascii_uppercase(), ); + for i in 0..special_date_function_call.args.nodes.len() { + if i > 0 { + name.push(','); + } + name.push('_'); + } + name.push_str(") (special)"); + self.record_call(name); } /// Get a list of functions and how often they were called, sorted by diff --git a/src/cmd/sql_test.rs b/src/cmd/sql_test.rs index 243e30c..02437b6 100644 --- a/src/cmd/sql_test.rs +++ b/src/cmd/sql_test.rs @@ -42,7 +42,7 @@ pub async fn cmd_sql_test(opt: &SqlTestOpt) -> Result<()> { // Keep track of our test results. 
let mut test_count = 0usize; let mut test_failures: Vec<(PathBuf, Error)> = vec![]; - let mut pending_paths: Vec = vec![]; + let mut pending: Vec<(PathBuf, String)> = vec![]; // Build a glob matching our test files, for use with `glob`. let dir_path_str = opt.dir_path.as_os_str().to_str().ok_or_else(|| { @@ -72,22 +72,23 @@ pub async fn cmd_sql_test(opt: &SqlTestOpt) -> Result<()> { // Skip pending tests unless asked to run them. if !opt.pending { - // Look for lines of the form `-- pending: db1, db2, ...`. - static PENDING_RE: Lazy = Lazy::new(|| { - Regex::new(r"(?m)^--\s*pending:\s*([a-zA-Z0-9_][a-zA-Z0-9_, ]*)").unwrap() - }); + // Look for lines of the form `-- pending: db1 Comment`. + static PENDING_RE: Lazy = + Lazy::new(|| Regex::new(r"(?m)^--\s*pending:\s*([a-zA-Z0-9_]+)(\s+.*)?").unwrap()); let target_string = driver.target().to_string(); if let Some(caps) = PENDING_RE.captures(&query) { - let dbs = caps.get(1).unwrap().as_str(); - if dbs.split(',').any(|db| db.trim() == target_string) { + let db = caps.get(1).unwrap().as_str(); + let comment = caps.get(2).map_or("", |m| m.as_str().trim()); + if db == target_string { print!("P"); let _ = io::stdout().flush(); - pending_paths.push( + pending.push(( path.strip_prefix(&base_dir) .unwrap_or_else(|_| &path) .to_owned(), - ); + comment.to_owned(), + )); continue; } @@ -114,10 +115,10 @@ pub async fn cmd_sql_test(opt: &SqlTestOpt) -> Result<()> { e.emit(); } - if !pending_paths.is_empty() { + if !pending.is_empty() { println!("\nPending tests:"); - for path in &pending_paths { - println!(" {}", path.display()); + for (path, comment) in &pending { + println!(" {} ({})", path.display(), comment); } } @@ -125,8 +126,8 @@ pub async fn cmd_sql_test(opt: &SqlTestOpt) -> Result<()> { Err(Error::Other("No tests found".into())) } else if test_failures.is_empty() { print!("\nOK: {} tests passed", test_count); - if !pending_paths.is_empty() { - print!(", {} pending", pending_paths.len()); + if !pending.is_empty() { + 
print!(", {} pending", pending.len()); } println!(); Ok(()) @@ -136,8 +137,8 @@ pub async fn cmd_sql_test(opt: &SqlTestOpt) -> Result<()> { test_failures.len(), test_count, ); - if !pending_paths.is_empty() { - print!(", {} pending", pending_paths.len()); + if !pending.is_empty() { + print!(", {} pending", pending.len()); } println!(); diff --git a/tests/sql/data_types/literal_scalars.sql b/tests/sql/data_types/literal_scalars.sql index ca6d28c..16105b3 100644 --- a/tests/sql/data_types/literal_scalars.sql +++ b/tests/sql/data_types/literal_scalars.sql @@ -1,4 +1,4 @@ --- pending: snowflake +-- pending: snowflake Test harness Arrow library reads 1.5 as 15 CREATE OR REPLACE TABLE __result1 AS SELECT diff --git a/tests/sql/functions/README.md b/tests/sql/functions/README.md new file mode 100644 index 0000000..a7a3b2b --- /dev/null +++ b/tests/sql/functions/README.md @@ -0,0 +1,53 @@ +# Function tests + +This directory contains tests for BigQuery SQL functions, to see if we can run +them on other platforms. + +## Tests to implement + +Here is a list of functions that are high priorities to implement. You can +generate your own version of this list by running `joinery parse +--count-function-calls queries.csv`. + +- [ ] REGEXP_REPLACE(_,_,_) +- [ ] REGEXP_EXTRACT(_,_) +- [ ] COALESCE(*) +- [ ] LOWER(_) +- [ ] TO_HEX(_) +- [ ] SHA256(_) +- [ ] LENGTH(_) +- [ ] CONCAT(*) +- [ ] TRIM(_) +- [ ] ARRAY_TO_STRING(_,_) +- [ ] SUM(_) +- [ ] FARM_FINGERPRINT(_) +- [ ] ANY_VALUE(_) +- [ ] ROW_NUMBER() OVER(..) +- [ ] COUNTIF(_) +- [ ] UPPER(_) +- [ ] ARRAY_AGG(_) +- [ ] DATE_TRUNC(_,_) (special) +- [ ] MIN(_) +- [ ] FORMAT_DATETIME(_,_) +- [ ] RAND() +- [ ] RANK() OVER(..) +- [ ] ARRAY_LENGTH(_) +- [ ] SUM(_) OVER(..) 
+- [ ] DATETIME_SUB(_,_) +- [ ] DATE_DIFF(_,_,_) (special) +- [ ] CURRENT_DATETIME() +- [ ] DATE_SUB(_,_) +- [ ] EXP(_) +- [ ] MAX(_) +- [ ] GENERATE_UUID() +- [ ] DATE(_) +- [ ] LEAST(_,_) +- [ ] APPROX_QUANTILES(_,_) +- [ ] GENERATE_DATE_ARRAY(_,_,_) +- [ ] DATE_ADD(_,_) +- [ ] LAG(_) OVER(..) +- [ ] DATETIME_DIFF(_,_,_) (special) +- [ ] DATETIME_TRUNC(_,_) (special) +- [ ] FIRST_VALUE(_) OVER(..) +- [ ] DATETIME(_) +- [ ] LEAST(_) diff --git a/tests/sql/functions/simple/regexp.sql b/tests/sql/functions/simple/regexp.sql new file mode 100644 index 0000000..518153f --- /dev/null +++ b/tests/sql/functions/simple/regexp.sql @@ -0,0 +1,20 @@ +-- pending: snowflake REGEXP_EXTRACT needs to be wrapped with REGEXP_SUBSTR +-- pending: sqlite3 No regex functions +-- +-- REGEXP_REPLACE, REGEXP_EXTRACT +-- +-- https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions +-- +-- We should consider testing a larger set of regular expression features, +-- because different databases may support different regex syntax. + +CREATE OR REPLACE TABLE __result1 AS +SELECT + REGEXP_REPLACE('foo', r'oo', 'ee') AS replaced, + REGEXP_EXTRACT('foobar', r'o+') AS extracted; + +CREATE OR REPLACE TABLE __expected1 ( + replaced STRING, + extracted STRING, +); +INSERT INTO __expected1 VALUES ('fee', 'oo');