From 6653d8e4c4c068bbade700c2db919735384b7ed6 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Sat, 7 Sep 2024 08:16:31 -0400 Subject: [PATCH 1/6] add regex ops --- mo_sql_parsing/keywords.py | 14 +++++++++----- mo_sql_parsing/utils.py | 4 ++++ tests/test_postgres.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py index 40df99f..cd789bb 100644 --- a/mo_sql_parsing/keywords.py +++ b/mo_sql_parsing/keywords.py @@ -122,7 +122,10 @@ # https://prestodb.io/docs/current/functions/comparison.html#is-distinct-from-and-is-not-distinct-from keyword("is not distinct from").set_parser_name("ne!") ) -REGEXP = keyword("regexp").set_parser_name("rgx") +REGEXP = (keyword("regexp") | Literal("~")).set_parser_name("regexp") +NOT_REGEXP = Literal("!~").set_parser_name("not_regexp") +REGEXP_I = Literal("~*").set_parser_name("regexp_i") +NOT_REGEXP_I = Literal("!~*").set_parser_name("not_regexp_i") NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") ASSIGN = Literal(":=").set_parser_name("assign") @@ -324,8 +327,10 @@ "lt": 5, "gt": 6, "eq": 7, - "rgx": 7, - "not_rgx": 7, + "regexp": 7, + "not_regexp": 7, + "regexp_i": 7, + "not_regexp_i": 7, "neq": 7, "missing": 7, "exists": 7, @@ -409,8 +414,7 @@ AND, OR, ASSIGN, - REGEXP, - NOT_REGEXP, + REGEXP | NOT_REGEXP | REGEXP_I | NOT_REGEXP_I, ] times = ["now", "today", "tomorrow", "eod"] diff --git a/mo_sql_parsing/utils.py b/mo_sql_parsing/utils.py index 8439e03..f92e3e0 100644 --- a/mo_sql_parsing/utils.py +++ b/mo_sql_parsing/utils.py @@ -264,6 +264,10 @@ def to_tuple_call(token, index, string): "<=>": "eq!", # https://sparkbyexamples.com/apache-hive/hive-relational-arithmetic-logical-operators/ "!=": "neq", "<>": "neq", + "!~*": "not_regexp_i", + "!~": "not_regexp", + "~*": "regexp_i", + "~": "regexp", "not in": "nin", "in": "in", "is_not": "neq", diff --git a/tests/test_postgres.py b/tests/test_postgres.py index 948b4c4..9c97ca2 100644 --- a/tests/test_postgres.py +++ b/tests/test_postgres.py @@ -544,3 +544,33 @@ def test_issue_239_jsonb2(self): "select": {"value": {"json_get_text": [{"cast": ["name", {"jsonb": {}}]}, {"literal": "field_key"}]}}, } self.assertEqual(result, expected) + + def test_issue_248_regex_operator1(self): + # https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP + sql = """SELECT 'abc' ~ 'abc'""" + result = parse(sql) + expected = {"select": {"value": {"regexp": ["abc", "abc"]}}} + + def test_issue_248_regex_operator2(self): + sql = """SELECT 'abc' ~* 'abc'""" + result = parse(sql) + expected = {"select": {"value": {"regexp": ["abc", "abc"], "ignore_case": True}}} + self.assertEqual(result, expected) + + def test_issue_248_regex_operator3(self): + sql = """SELECT 'abc' !~ 'abc'""" + try: + result = parse(sql) + except Exception: + pass + with Debugger(): + result = parse(sql) + + expected = {"select": {"value": {"not_regexp": ["abc", "abc"]}}} + self.assertEqual(result, expected) + + def test_issue_248_regex_operator4(self): + sql = """SELECT 'abc' !~* 'abc'""" + result = parse(sql) + expected = {"select": {"value": {"not_regexp": ["abc", "abc"], "ignore_case": True}}} + self.assertEqual(result, expected) From 582533f98c22cf005afeeb8e4dbc953ca0a3a861 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Sat, 7 Sep 2024 10:56:42 -0400 Subject: [PATCH 2/6] still broken --- mo_sql_parsing/keywords.py | 11 ++++++++--- mo_sql_parsing/utils.py | 7 ++++++- tests/test_postgres.py | 15 ++++++++++----- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py index cd789bb..bc17bb8 100644 --- a/mo_sql_parsing/keywords.py +++ b/mo_sql_parsing/keywords.py @@ -86,6 +86,9 @@ PRIMARY_KEY = Group(PRIMARY + KEY).set_parser_name("primary_key") FOREIGN_KEY = Group(FOREIGN + KEY).set_parser_name("foreign_key") +NOT = keyword("not") + + # SIMPLE OPERATORS CONCAT = Literal("||").set_parser_name("concat") MUL = Literal("*").set_parser_name("mul") @@ -123,7 +126,7 @@ keyword("is not distinct from").set_parser_name("ne!") ) REGEXP = (keyword("regexp") | Literal("~")).set_parser_name("regexp") -NOT_REGEXP = Literal("!~").set_parser_name("not_regexp") +NOT_REGEXP = (NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") REGEXP_I = Literal("~*").set_parser_name("regexp_i") NOT_REGEXP_I = Literal("!~*").set_parser_name("not_regexp_i") NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") @@ -150,7 +153,6 @@ ELSE = keyword("else").suppress() IN = keyword("in") IS = keyword("is") -NOT = keyword("not") OR = keyword("or") LATERAL = keyword("lateral") PIVOT = keyword("pivot") @@ -414,7 +416,10 @@ AND, OR, ASSIGN, - REGEXP | NOT_REGEXP | REGEXP_I | NOT_REGEXP_I, + NOT_REGEXP_I, + NOT_REGEXP, + REGEXP_I, + REGEXP, ] times = ["now", "today", "tomorrow", "eod"] diff --git a/mo_sql_parsing/utils.py b/mo_sql_parsing/utils.py index f92e3e0..bce7dff 100644 --- a/mo_sql_parsing/utils.py +++ b/mo_sql_parsing/utils.py @@ -9,6 +9,7 @@ import ast import sys +from typing import List from mo_dots import is_data, is_null, literal_field, unliteral_field from mo_future import text, number_types, binary_type, flatten @@ -22,7 +23,7 @@ class Call(object): __slots__ = ["op", "args", "kwargs"] - def __init__(self, op, args, kwargs): + def __init__(self, op, args : List, kwargs: Dict): self.op = op self.args = args self.kwargs = kwargs @@ -169,6 +170,10 @@ def to_json_operator(tokens): return Call("exists", tokens[0], {}) else: return Call("missing", tokens[0], {}) + elif op == "regexp_i": + return Call("regexp", [tokens[0], tokens[2]], {"ignore_case": True}) + elif op == "not_regexp_i": + return Call("not_regexp", [tokens[0], tokens[2]], {"ignore_case": True}) operands = [tokens[0], tokens[2]] binary_op = Call(op, operands, {}) diff --git a/tests/test_postgres.py b/tests/test_postgres.py index 9c97ca2..276c6c8 100644 --- a/tests/test_postgres.py +++ b/tests/test_postgres.py @@ -549,12 +549,17 @@ def test_issue_248_regex_operator1(self): # https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP sql = """SELECT 'abc' ~ 'abc'""" result = parse(sql) - expected = {"select": {"value": {"regexp": ["abc", "abc"]}}} + expected = {"select": {"value": {"regexp": [{"literal": "abc"}, {"literal": "abc"}]}}} def test_issue_248_regex_operator2(self): sql = """SELECT 'abc' ~* 'abc'""" - result = parse(sql) - expected = {"select": {"value": {"regexp": ["abc", "abc"], "ignore_case": True}}} + try: + result = parse(sql) + except Exception: + pass + with Debugger(): + result = parse(sql) + expected = {"select": {"value": {"regexp": [{"literal": "abc"}, {"literal": "abc"}], "ignore_case": True}}} self.assertEqual(result, expected) def test_issue_248_regex_operator3(self): @@ -566,11 +571,11 @@ def test_issue_248_regex_operator3(self): with Debugger(): result = parse(sql) - expected = {"select": {"value": {"not_regexp": ["abc", "abc"]}}} + expected = {"select": {"value": {"not_regexp": [{"literal": "abc"}, {"literal": "abc"}]}}} self.assertEqual(result, expected) def test_issue_248_regex_operator4(self): sql = """SELECT 'abc' !~* 'abc'""" result = parse(sql) - expected = {"select": {"value": {"not_regexp": ["abc", "abc"], "ignore_case": True}}} + expected = {"select": {"value": {"regexp": [{"literal": "abc"}, {"literal": "abc"}], "ignore_case": True}}} self.assertEqual(result, expected) From 81f64407dc5c6ae6087c5b6e3d2b808cfef121f3 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Sun, 8 Sep 2024 10:50:39 -0400 Subject: [PATCH 3/6] found problem. fixed --- mo_sql_parsing/keywords.py | 6 +++--- tests/test_postgres.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py index bc17bb8..10212ee 100644 --- a/mo_sql_parsing/keywords.py +++ b/mo_sql_parsing/keywords.py @@ -126,7 +126,7 @@ keyword("is not distinct from").set_parser_name("ne!") ) REGEXP = (keyword("regexp") | Literal("~")).set_parser_name("regexp") -NOT_REGEXP = (NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") +NOT_REGEXP = Group(NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") REGEXP_I = Literal("~*").set_parser_name("regexp_i") NOT_REGEXP_I = Literal("!~*").set_parser_name("not_regexp_i") NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") @@ -190,7 +190,6 @@ NOT_LIKE = Group(NOT + LIKE).set_parser_name("not_like") NOT_RLIKE = Group(NOT + RLIKE).set_parser_name("not_rlike") NOT_IN = Group(NOT + IN).set_parser_name("nin") -NOT_REGEXP = Group(NOT + REGEXP).set_parser_name("not_regexp") IS_NOT = Group(IS + NOT).set_parser_name("is_not") _SIMILAR = keyword("similar") @@ -396,7 +395,8 @@ BINARY_AND, BINARY_OR, GTE | LTE | LT | GT, - EEQ | NEQ | DEQ | IDF | INDF, + EEQ | DEQ | IDF | INDF, + NEQ, AT_TIME_ZONE, (BETWEEN, AND), (NOT_BETWEEN, AND), diff --git a/tests/test_postgres.py b/tests/test_postgres.py index 276c6c8..45930a6 100644 --- a/tests/test_postgres.py +++ b/tests/test_postgres.py @@ -577,5 +577,5 @@ def test_issue_248_regex_operator3(self): def test_issue_248_regex_operator4(self): sql = """SELECT 'abc' !~* 'abc'""" result = parse(sql) - expected = {"select": {"value": {"regexp": [{"literal": "abc"}, {"literal": "abc"}], "ignore_case": True}}} + expected = {"select": {"value": {"not_regexp": [{"literal": "abc"}, {"literal": "abc"}], "ignore_case": True}}} self.assertEqual(result, expected) From a4a3fb7c1db2733744904bd15b17fa9039c90724 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Sun, 8 Sep 2024 10:57:15 -0400 Subject: [PATCH 4/6] formatting --- mo_sql_parsing/keywords.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py index 10212ee..882336f 100644 --- a/mo_sql_parsing/keywords.py +++ b/mo_sql_parsing/keywords.py @@ -47,6 +47,7 @@ LIMIT = keyword("limit").suppress() MINUS = keyword("minus") NATURAL = keyword("natural") +NOT = keyword("not") OFFSET = keyword("offset").suppress() ON = keyword("on").suppress() ORDER = keyword("order").suppress() @@ -86,8 +87,6 @@ PRIMARY_KEY = Group(PRIMARY + KEY).set_parser_name("primary_key") FOREIGN_KEY = Group(FOREIGN + KEY).set_parser_name("foreign_key") -NOT = keyword("not") - # SIMPLE OPERATORS CONCAT = Literal("||").set_parser_name("concat") @@ -125,12 +124,11 @@ # https://prestodb.io/docs/current/functions/comparison.html#is-distinct-from-and-is-not-distinct-from keyword("is not distinct from").set_parser_name("ne!") ) +NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") +ASSIGN = Literal(":=").set_parser_name("assign") REGEXP = (keyword("regexp") | Literal("~")).set_parser_name("regexp") -NOT_REGEXP = Group(NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") REGEXP_I = Literal("~*").set_parser_name("regexp_i") NOT_REGEXP_I = Literal("!~*").set_parser_name("not_regexp_i") -NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") -ASSIGN = Literal(":=").set_parser_name("assign") JSON_GET = Literal("->").set_parser_name("json_get") JSON_GET_TEXT = Literal("->>").set_parser_name("json_get_text") @@ -189,6 +187,7 @@ NOT_ILIKE = Group(NOT + ILIKE).set_parser_name("not_ilike") NOT_LIKE = Group(NOT + LIKE).set_parser_name("not_like") NOT_RLIKE = Group(NOT + RLIKE).set_parser_name("not_rlike") +NOT_REGEXP = Group(NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") NOT_IN = Group(NOT + IN).set_parser_name("nin") IS_NOT = Group(IS + NOT).set_parser_name("is_not") From 595cf6727b500a8393dacf809807e4c8b0499284 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Sun, 8 Sep 2024 10:58:54 -0400 Subject: [PATCH 5/6] formatting --- mo_sql_parsing/keywords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py index 882336f..6eb8a11 100644 --- a/mo_sql_parsing/keywords.py +++ b/mo_sql_parsing/keywords.py @@ -187,8 +187,8 @@ NOT_ILIKE = Group(NOT + ILIKE).set_parser_name("not_ilike") NOT_LIKE = Group(NOT + LIKE).set_parser_name("not_like") NOT_RLIKE = Group(NOT + RLIKE).set_parser_name("not_rlike") -NOT_REGEXP = Group(NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") NOT_IN = Group(NOT + IN).set_parser_name("nin") +NOT_REGEXP = Group(NOT + keyword("regexp") | Literal("!~")).set_parser_name("not_regexp") IS_NOT = Group(IS + NOT).set_parser_name("is_not") _SIMILAR = keyword("similar") From 6572313c0739301a599be7ac77413c08c242e129 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Sun, 8 Sep 2024 11:02:09 -0400 Subject: [PATCH 6/6] cleaner diff --- mo_sql_parsing/keywords.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mo_sql_parsing/keywords.py b/mo_sql_parsing/keywords.py index 6eb8a11..7dde973 100644 --- a/mo_sql_parsing/keywords.py +++ b/mo_sql_parsing/keywords.py @@ -124,11 +124,11 @@ # https://prestodb.io/docs/current/functions/comparison.html#is-distinct-from-and-is-not-distinct-from keyword("is not distinct from").set_parser_name("ne!") ) -NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") -ASSIGN = Literal(":=").set_parser_name("assign") REGEXP = (keyword("regexp") | Literal("~")).set_parser_name("regexp") REGEXP_I = Literal("~*").set_parser_name("regexp_i") NOT_REGEXP_I = Literal("!~*").set_parser_name("not_regexp_i") +NEQ = (Literal("!=") | Literal("<>")).set_parser_name("neq") +ASSIGN = Literal(":=").set_parser_name("assign") JSON_GET = Literal("->").set_parser_name("json_get") JSON_GET_TEXT = Literal("->>").set_parser_name("json_get_text") @@ -394,8 +394,7 @@ BINARY_AND, BINARY_OR, GTE | LTE | LT | GT, - EEQ | DEQ | IDF | INDF, - NEQ, + EEQ | NEQ | DEQ | IDF | INDF, AT_TIME_ZONE, (BETWEEN, AND), (NOT_BETWEEN, AND), @@ -415,10 +414,7 @@ AND, OR, ASSIGN, - NOT_REGEXP_I, - NOT_REGEXP, - REGEXP_I, - REGEXP, + NOT_REGEXP_I | NOT_REGEXP | REGEXP_I | REGEXP, ] times = ["now", "today", "tomorrow", "eod"]