From be2610767e52697f377eaae22b65e2abd53f4f18 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Tue, 5 Nov 2024 13:30:18 +0100 Subject: [PATCH] Expand LIKE simplification - cover expression known not to be null - cover NULL pattern - cover repeated '%%' in pattern --- datafusion-cli/Cargo.lock | 1 + datafusion/optimizer/Cargo.toml | 1 + .../simplify_expressions/expr_simplifier.rs | 260 +++++++++++++----- .../test_files/string/init_data.slt.part | 1 + .../test_files/string/string_query.slt.part | 43 ++- 5 files changed, 215 insertions(+), 91 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 541d464d381f..a84a5de08e10 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1482,6 +1482,7 @@ dependencies = [ "itertools", "log", "paste", + "regex", "regex-syntax", ] diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 79a5bb24e918..bc9f0f850094 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -47,6 +47,7 @@ indexmap = { workspace = true } itertools = { workspace = true } log = { workspace = true } paste = "1.0.14" +regex = "1.11.0" regex-syntax = "0.8.0" [dev-dependencies] diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index e0df6a3a68ce..57c59146f77a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -49,6 +49,7 @@ use crate::analyzer::type_coercion::TypeCoercionRewriter; use crate::simplify_expressions::guarantees::GuaranteeRewriter; use crate::simplify_expressions::regex::simplify_regex_expr; use crate::simplify_expressions::SimplifyInfo; +use regex::Regex; use super::inlist_simplifier::ShortenInListSimplifier; use super::utils::*; @@ -1470,34 +1471,54 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) => 
Transformed::yes(simplify_regex_expr(left, op, right)?), // Rules for Like - Expr::Like(Like { - expr, - pattern, - negated, - escape_char: _, - case_insensitive: _, - }) if matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%" - ) || matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%" - ) || matches!( - pattern.as_ref(), - Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%" - ) => - { - // exp LIKE '%' is - // - when exp is not NULL, it's true - // - when exp is NULL, it's NULL - // exp NOT LIKE '%' is - // - when exp is not NULL, it's false - // - when exp is NULL, it's NULL - Transformed::yes(Expr::Case(Case { - expr: Some(Box::new(Expr::IsNotNull(expr))), - when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))], - else_expr: None, - })) + Expr::Like(like) => { + match as_string_scalar(&like.pattern) { + Some((data_type, pattern_str)) => { + match pattern_str { + None => return Ok(Transformed::yes(lit_bool_null())), + Some(pattern_str) if pattern_str == "%" => { + // exp LIKE '%' is + // - when exp is not NULL, it's true + // - when exp is NULL, it's NULL + // exp NOT LIKE '%' is + // - when exp is not NULL, it's false + // - when exp is NULL, it's NULL + let result_for_non_null = lit(!like.negated); + Transformed::yes(if !info.nullable(&like.expr)? 
{ + result_for_non_null + } else { + Expr::Case(Case { + expr: Some(Box::new(Expr::IsNotNull(like.expr))), + when_then_expr: vec![( + Box::new(lit(true)), + Box::new(result_for_non_null), + )], + else_expr: None, + }) + }) + } + Some(pattern_str) + if pattern_str.contains("%%") && + // TODO support more complete unescaping; until then, skip any pattern that contains the escape character (`\` when none is given) + !pattern_str.contains(like.escape_char.unwrap_or('\\')) => + { + let simplified_pattern = Regex::new("%%+") + .unwrap() + .replace_all(pattern_str, "%") + .to_string(); + Transformed::yes(Expr::Like(Like { + pattern: Box::new(to_string_scalar( + data_type, + Some(simplified_pattern), + )), + ..like + })) + } + Some(_pattern_str) => Transformed::no(Expr::Like(like)), + } + } + None => Transformed::no(Expr::Like(like)), + } } // a is not null/unknown --> true (if a is not nullable) @@ -1696,6 +1717,24 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { } } +fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> { + match expr { + Expr::Literal(ScalarValue::Utf8(s)) => Some((DataType::Utf8, s)), + Expr::Literal(ScalarValue::LargeUtf8(s)) => Some((DataType::LargeUtf8, s)), + Expr::Literal(ScalarValue::Utf8View(s)) => Some((DataType::Utf8View, s)), + _ => None, + } +} + +fn to_string_scalar(data_type: DataType, value: Option<String>) -> Expr { + match data_type { + DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value)), + DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value)), + DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value)), + _ => unreachable!(), + } +} + fn has_common_conjunction(lhs: &Expr, rhs: &Expr) -> bool { let lhs_set: HashSet<&Expr> = iter_conjunction(lhs).collect(); iter_conjunction(rhs).any(|e| lhs_set.contains(&e) && !e.is_volatile()) @@ -2810,10 +2849,16 @@ mod tests { ); // single character - assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%")); + assert_change( + regex_match(col("c1"), lit("x")), + like(col("c1"), lit("%x%")), + ); // 
single word - assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%")); + assert_change( + regex_match(col("c1"), lit("foo")), + like(col("c1"), lit("%foo%")), + ); // regular expressions that match an exact literal assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit(""))); @@ -2900,44 +2945,50 @@ mod tests { assert_no_change(regex_match(col("c1"), lit("$foo^"))); // regular expressions that match a partial literal - assert_change(regex_match(col("c1"), lit("^foo")), like(col("c1"), "foo%")); - assert_change(regex_match(col("c1"), lit("foo$")), like(col("c1"), "%foo")); + assert_change( + regex_match(col("c1"), lit("^foo")), + like(col("c1"), lit("foo%")), + ); + assert_change( + regex_match(col("c1"), lit("foo$")), + like(col("c1"), lit("%foo")), + ); assert_change( regex_match(col("c1"), lit("^foo|bar$")), - like(col("c1"), "foo%").or(like(col("c1"), "%bar")), + like(col("c1"), lit("foo%")).or(like(col("c1"), lit("%bar"))), ); // OR-chain assert_change( regex_match(col("c1"), lit("foo|bar|baz")), - like(col("c1"), "%foo%") - .or(like(col("c1"), "%bar%")) - .or(like(col("c1"), "%baz%")), + like(col("c1"), lit("%foo%")) + .or(like(col("c1"), lit("%bar%"))) + .or(like(col("c1"), lit("%baz%"))), ); assert_change( regex_match(col("c1"), lit("foo|x|baz")), - like(col("c1"), "%foo%") - .or(like(col("c1"), "%x%")) - .or(like(col("c1"), "%baz%")), + like(col("c1"), lit("%foo%")) + .or(like(col("c1"), lit("%x%"))) + .or(like(col("c1"), lit("%baz%"))), ); assert_change( regex_not_match(col("c1"), lit("foo|bar|baz")), - not_like(col("c1"), "%foo%") - .and(not_like(col("c1"), "%bar%")) - .and(not_like(col("c1"), "%baz%")), + not_like(col("c1"), lit("%foo%")) + .and(not_like(col("c1"), lit("%bar%"))) + .and(not_like(col("c1"), lit("%baz%"))), ); // both anchored expressions (translated to equality) and unanchored assert_change( regex_match(col("c1"), lit("foo|^x$|baz")), - like(col("c1"), "%foo%") + like(col("c1"), lit("%foo%")) 
.or(col("c1").eq(lit("x"))) - .or(like(col("c1"), "%baz%")), + .or(like(col("c1"), lit("%baz%"))), ); assert_change( regex_not_match(col("c1"), lit("foo|^bar$|baz")), - not_like(col("c1"), "%foo%") + not_like(col("c1"), lit("%foo%")) .and(col("c1").not_eq(lit("bar"))) - .and(not_like(col("c1"), "%baz%")), + .and(not_like(col("c1"), lit("%baz%"))), ); // Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION) assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc"))); @@ -2987,41 +3038,41 @@ mod tests { }) } - fn like(expr: Expr, pattern: &str) -> Expr { + fn like(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: false, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: false, }) } - fn not_like(expr: Expr, pattern: &str) -> Expr { + fn not_like(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: true, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: false, }) } - fn ilike(expr: Expr, pattern: &str) -> Expr { + fn ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: false, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: true, }) } - fn not_ilike(expr: Expr, pattern: &str) -> Expr { + fn not_ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr { Expr::Like(Like { negated: true, expr: Box::new(expr), - pattern: Box::new(lit(pattern)), + pattern: Box::new(pattern.into()), escape_char: None, case_insensitive: true, }) @@ -3633,31 +3684,112 @@ mod tests { #[test] fn test_like_and_ilke() { - // LIKE '%' - let expr = like(col("c1"), "%"); + let null = lit(ScalarValue::Utf8(None)); + + // expr [NOT] [I]LIKE NULL + let expr = like(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(col("c1"), null.clone()); + 
assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(col("c1"), null.clone()); + assert_eq!(simplify(expr), lit_bool_null()); + + // expr [NOT] [I]LIKE '%' + let expr = like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_like(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + let expr = ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), true)); + + let expr = not_ilike(col("c1"), lit("%")); + assert_eq!(simplify(expr), if_not_null(col("c1"), false)); + + // expr [NOT] [I]LIKE '%%' + let expr = like(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), true)); - let expr = not_like(col("c1"), "%"); + let expr = not_like(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), false)); - let expr = ilike(col("c1"), "%"); + let expr = ilike(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), true)); - let expr = not_ilike(col("c1"), "%"); + let expr = not_ilike(col("c1"), lit("%%")); assert_eq!(simplify(expr), if_not_null(col("c1"), false)); - // null_constant LIKE '%' + // not_null_expr [NOT] [I]LIKE '%' + let expr = like(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_like(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(false)); + + let expr = ilike(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_ilike(col("c1_non_null"), lit("%")); + assert_eq!(simplify(expr), lit(false)); + + // not_null_expr [NOT] [I]LIKE '%%' + let expr = like(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(true)); + + let expr = not_like(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(false)); + + let expr = ilike(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), 
lit(true)); + + let expr = not_ilike(col("c1_non_null"), lit("%%")); + assert_eq!(simplify(expr), lit(false)); + + // null_constant [NOT] [I]LIKE '%' + let expr = like(null.clone(), lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(null.clone(), lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(null.clone(), lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(null, lit("%")); + assert_eq!(simplify(expr), lit_bool_null()); + + // null_constant [NOT] [I]LIKE '%%' + let null = lit(ScalarValue::Utf8(None)); + let expr = like(null.clone(), lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_like(null.clone(), lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = ilike(null.clone(), lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + let expr = not_ilike(null, lit("%%")); + assert_eq!(simplify(expr), lit_bool_null()); + + // null_constant [NOT] [I]LIKE 'a%' let null = lit(ScalarValue::Utf8(None)); - let expr = like(null.clone(), "%"); + let expr = like(null.clone(), lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); - let expr = not_like(null.clone(), "%"); + let expr = not_like(null.clone(), lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); - let expr = ilike(null.clone(), "%"); + let expr = ilike(null.clone(), lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); - let expr = not_ilike(null, "%"); + let expr = not_ilike(null, lit("a%")); assert_eq!(simplify(expr), lit_bool_null()); } diff --git a/datafusion/sqllogictest/test_files/string/init_data.slt.part b/datafusion/sqllogictest/test_files/string/init_data.slt.part index e3914ea49855..28a93239a273 100644 --- a/datafusion/sqllogictest/test_files/string/init_data.slt.part +++ b/datafusion/sqllogictest/test_files/string/init_data.slt.part @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+# TODO (https://github.com/apache/datafusion/issues/12637): add a row with '%%' pattern statement ok create table test_source as values ('Andrew', 'X', 'datafusion📊🔥', '🔥'), diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 57fb09bca9e4..4bb3aa563ea9 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -873,41 +873,39 @@ NULL NULL NULL NULL NULL #Raphael datafusionДатаФусион false false false false #NULL NULL NULL NULL NULL NULL -# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections -query TTBB +query TTBBBB SELECT ascii_1, unicode_1, ascii_1 LIKE '%' AS ascii_1_like_percent, - unicode_1 LIKE '%' AS unicode_1_like_percent - -- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 - -- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + unicode_1 LIKE '%' AS unicode_1_like_percent, + ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, + unicode_1 LIKE '%%' AS unicode_1_like_percent_percent FROM test_basic_operator ---- -Andrew datafusion📊🔥 true true -Xiangpeng datafusion数据融合 true true -Raphael datafusionДатаФусион true true -under_score un iść core true true -percent pan Tadeusz ma iść w kąt true true -(empty) (empty) true true -NULL NULL NULL NULL -NULL NULL NULL NULL +Andrew datafusion📊🔥 true true true true +Xiangpeng datafusion数据融合 true true true true +Raphael datafusionДатаФусион true true true true +under_score un iść core true true true true +percent pan Tadeusz ma iść w kąt true true true true +(empty) (empty) true true true true +NULL NULL NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL -# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional 
test projections -query TTBB +query TTBBBB SELECT ascii_1, unicode_1, ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent, - unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent - -- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 - -- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637 + unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent, + ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, + unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent FROM test_basic_operator ---- -Andrew datafusion📊🔥 false false -Xiangpeng datafusion数据融合 false false -Raphael datafusionДатаФусион false false -under_score un iść core false false -percent pan Tadeusz ma iść w kąt false false -(empty) (empty) false false -NULL NULL NULL NULL -NULL NULL NULL NULL +Andrew datafusion📊🔥 false false false false +Xiangpeng datafusion数据融合 false false false false +Raphael datafusionДатаФусион false false false false +under_score un iść core false false false false +percent pan Tadeusz ma iść w kąt false false false false +(empty) (empty) false false false false +NULL NULL NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL query T SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%'