From cfaad790622b0c68ad4d592be5d2f91e39396ca8 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Wed, 26 Jun 2024 21:45:43 -0400 Subject: [PATCH 1/5] better ident parsing --- mo_sql_parsing/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mo_sql_parsing/utils.py b/mo_sql_parsing/utils.py index 620174d..8439e03 100644 --- a/mo_sql_parsing/utils.py +++ b/mo_sql_parsing/utils.py @@ -908,8 +908,9 @@ def no_dashes(tokens, start, string): digit = Char("0123456789") with whitespaces.NO_WHITESPACE: - ident_w_dash = Char(FIRST_IDENT_CHAR) + (Regex("(?<=[^ 0-9])\\-(?=[^ 0-9])") | Char(IDENT_CHAR))[...] - ident_w_dash_warning = Regex(ident_w_dash.__regex__()[1]).set_parser_name("identifier_with_dashes") / no_dashes + # repack the expression into a regex for faster parsing ident_w_dash + ident_w_dash = Regex((Char(FIRST_IDENT_CHAR) + (Regex("(?<=[^ 0-9])\\-(?=[^ 0-9])") | Char(IDENT_CHAR))[...]).__regex__()[1]) + ident_w_dash_warning = ident_w_dash.set_parser_name("identifier_with_dashes") / no_dashes simple_ident = Word(FIRST_IDENT_CHAR, IDENT_CHAR).set_parser_name("identifier") sqlserver_local_ident = Word("@" + FIRST_IDENT_CHAR, IDENT_CHAR).set_parser_name("identifier") From 076bddb4a0395bea89e727064b9d47bafa75c7a7 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Wed, 26 Jun 2024 21:58:47 -0400 Subject: [PATCH 2/5] docs --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dce7980..9f2bdfe 100644 --- a/README.md +++ b/README.md @@ -48,13 +48,13 @@ The `SELECT` clause is an array of objects containing `name` and `value` propert There are a few parsing modes you may be interested in: -#### Double-quotes for literal strings +**Double-quotes for literal strings** MySQL uses both double quotes and single quotes to declare literal strings. This is not ansi behaviour, but it is more forgiving for programmers coming from other languages. A specific parse function is provided: result = parse_mysql(sql) -#### SQLServer Identifiers (`[]`) +**SQLServer Identifiers (`[]`)** SQLServer uses square brackets to delimit identifiers. For example @@ -64,7 +64,7 @@ which conflicts with BigQuery array constructor (eg `[1, 2, 3, 4]`). You may use from mo_sql_parsing import parse_sqlserver as parse -#### NULL is None +**NULL is None** The default output for this parser is to emit a null function `{"null":{}}` wherever `NULL` is encountered in the SQL. If you would like something different, you can replace nulls with `None` (or anything else for that matter): @@ -73,7 +73,7 @@ The default output for this parser is to emit a null function `{"null":{}}` wher this has been implemented with a post-parse rewriting of the parse tree. -#### Normalized function call form +**Normalized function call form** The default behaviour of the parser is to output function calls in `simple_op` format: The operator being a key in the object; `{op: params}`. This form can be difficult to work with because the object must be scanned for known operators, or possible optional arguments, or at least distinguished from a query object. From 584eb5707378161d64608e75c5343b4c766fa3a7 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Wed, 26 Jun 2024 22:01:59 -0400 Subject: [PATCH 3/5] revert --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9f2bdfe..dce7980 100644 --- a/README.md +++ b/README.md @@ -48,13 +48,13 @@ The `SELECT` clause is an array of objects containing `name` and `value` propert There are a few parsing modes you may be interested in: -**Double-quotes for literal strings** +#### Double-quotes for literal strings MySQL uses both double quotes and single quotes to declare literal strings. This is not ansi behaviour, but it is more forgiving for programmers coming from other languages. A specific parse function is provided: result = parse_mysql(sql) -**SQLServer Identifiers (`[]`)** +#### SQLServer Identifiers (`[]`) SQLServer uses square brackets to delimit identifiers. For example @@ -64,7 +64,7 @@ which conflicts with BigQuery array constructor (eg `[1, 2, 3, 4]`). You may use from mo_sql_parsing import parse_sqlserver as parse -**NULL is None** +#### NULL is None The default output for this parser is to emit a null function `{"null":{}}` wherever `NULL` is encountered in the SQL. If you would like something different, you can replace nulls with `None` (or anything else for that matter): @@ -73,7 +73,7 @@ The default output for this parser is to emit a null function `{"null":{}}` wher this has been implemented with a post-parse rewriting of the parse tree. -**Normalized function call form** +#### Normalized function call form The default behaviour of the parser is to output function calls in `simple_op` format: The operator being a key in the object; `{op: params}`. This form can be difficult to work with because the object must be scanned for known operators, or possible optional arguments, or at least distinguished from a query object. From 9939e2d016265cd1f6cf8deb88f5e4fe0d87f6cd Mon Sep 17 00:00:00 2001 From: Steven Ayers Date: Tue, 30 Jul 2024 13:02:37 +0100 Subject: [PATCH 4/5] Support for ENCODE, DISTKEY and SORTKEY in Redshift (#245) --- mo_sql_parsing/sql_parser.py | 5 ++ mo_sql_parsing/types.py | 1 + tests/test_redshift.py | 152 +++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+) diff --git a/mo_sql_parsing/sql_parser.py b/mo_sql_parsing/sql_parser.py index 40c3eaa..50c35e6 100644 --- a/mo_sql_parsing/sql_parser.py +++ b/mo_sql_parsing/sql_parser.py @@ -704,6 +704,11 @@ def mult(tokens): ) + Optional(AS.suppress() + infix_notation(query, [])("query")) + Optional(CLUSTER_BY.suppress() + LB + delimited_list(identifier) + RB)("cluster_by") + + ZeroOrMore( + assign("sortkey", LB + delimited_list(identifier) + RB) + | assign("distkey", LB + identifier + RB) + ) + )("create table") definer = Optional(keyword("definer").suppress() + EQ + identifier("definer")) diff --git a/mo_sql_parsing/types.py b/mo_sql_parsing/types.py index 849a51d..a36871f 100644 --- a/mo_sql_parsing/types.py +++ b/mo_sql_parsing/types.py @@ -259,6 +259,7 @@ def get_column_type(expr, identifier, literal_string): | flag("auto_increment") | flag("autoincrement") | assign("comment", literal_string) + | assign("encode", identifier) | assign("character set", identifier) | assign("collate", Optional(EQ) + identifier) | flag("primary key") diff --git a/tests/test_redshift.py b/tests/test_redshift.py index 316f7ee..553ff6c 100644 --- a/tests/test_redshift.py +++ b/tests/test_redshift.py @@ -13,6 +13,158 @@ class TestRedshift(TestCase): + def test_issue245a_encode(self): + sql = f""" + CREATE TABLE IF NOT EXISTS web.customers + ( + customer_number INT ENCODE LZO, + metadata SUPER ENCODE LZO, + email VARCHAR(255) ENCODE LZO, + created_at TIMESTAMP ENCODE AZ64, + last_logged_in_at TIMESTAMP ENCODE AZ64 + ) + """ + result = parse(sql) + self.assertEqual( + result, { + "create table": { + "replace": False, + "name": "web.customers", + "columns": [ + {"name": "customer_number", "type": {"int": {}}, "encode": "LZO"}, + {"name": "metadata", "type": "SUPER", "encode": "LZO"}, + {"name": "email", "type": {"varchar": 255}, "encode": "LZO"}, + {"name": "created_at", "type": {"timestamp": {}}, "encode": "AZ64"}, + {"name": "last_logged_in_at", "type": {"timestamp": {}}, "encode": "AZ64"} + ] + } + } + ) + + def test_issue245b_single_sortkey(self): + sql = f""" + CREATE TABLE IF NOT EXISTS web.customers + ( + customer_number INT, + metadata SUPER, + email VARCHAR(255), + created_at TIMESTAMP, + last_logged_in_at TIMESTAMP + ) + SORTKEY (customer_number) + """ + result = parse(sql) + self.assertEqual( + result, { + "create table": { + "replace": False, + "name": "web.customers", + "columns": [ + {"name": "customer_number", "type": {"int": {}}}, + {"name": "metadata", "type": "SUPER"}, + {"name": "email", "type": {"varchar": 255}}, + {"name": "created_at", "type": {"timestamp": {}}}, + {"name": "last_logged_in_at", "type": {"timestamp": {}}} + ], + "sortkey": "customer_number" + } + } + ) + + def test_issue245c_multiple_sortkeys(self): + sql = f""" + CREATE TABLE IF NOT EXISTS web.customers + ( + customer_number INT, + metadata SUPER, + email VARCHAR(255), + created_at TIMESTAMP, + last_logged_in_at TIMESTAMP + ) + SORTKEY(customer_number, email) + """ + result = parse(sql) + self.assertEqual( + result, { + "create table": { + "replace": False, + "name": "web.customers", + "columns": [ + {"name": "customer_number", "type": {"int": {}}}, + {"name": "metadata", "type": "SUPER"}, + {"name": "email", "type": {"varchar": 255}}, + {"name": "created_at", "type": {"timestamp": {}}}, + {"name": "last_logged_in_at", "type": {"timestamp": {}}} + ], + "sortkey": [ + "customer_number", "email" + ] + } + } + ) + + def test_issue245d_distkey(self): + sql = f""" + CREATE TABLE IF NOT EXISTS web.customers + ( + customer_number INT, + metadata SUPER, + email VARCHAR(255), + created_at TIMESTAMP, + last_logged_in_at TIMESTAMP + ) + DISTKEY(customer_number) + """ + result = parse(sql) + self.assertEqual( + result, { + "create table": { + "replace": False, + "name": "web.customers", + "columns": [ + {"name": "customer_number", "type": {"int": {}}}, + {"name": "metadata", "type": "SUPER"}, + {"name": "email", "type": {"varchar": 255}}, + {"name": "created_at", "type": {"timestamp": {}}}, + {"name": "last_logged_in_at", "type": {"timestamp": {}}} + ], + "distkey": "customer_number" + } + } + ) + + def test_issue245e_combination(self): + sql = f""" + CREATE TABLE IF NOT EXISTS web.customers + ( + customer_number INT ENCODE LZO, + metadata SUPER ENCODE LZO, + email VARCHAR(255) ENCODE LZO, + created_at TIMESTAMP ENCODE AZ64, + last_logged_in_at TIMESTAMP ENCODE AZ64 + ) + DISTKEY (customer_number) + SORTKEY(created_at) + """ + result = parse(sql) + self.assertEqual( + result, { + "create table": { + "replace": False, + "name": "web.customers", + "columns": [ + {"name": "customer_number", "type": {"int": {}}, "encode": "LZO"}, + {"name": "metadata", "type": "SUPER", "encode": "LZO"}, + {"name": "email", "type": {"varchar": 255}, "encode": "LZO"}, + {"name": "created_at", "type": {"timestamp": {}}, "encode": "AZ64"}, + {"name": "last_logged_in_at", "type": {"timestamp": {}}, "encode": "AZ64"} + ], + "distkey": "customer_number", + "sortkey": "created_at" + } + } + ) + def test_issue149a_casting(self): # Ref: https://docs.aws.amazon.com/redshift/latest/dg/r_CAST_function.html#r_CAST_function-examples sql = "select '' :: varchar as placeholder from table" From ed8a1bd9f35c9c85f7d43ec9033be7fe5ed98805 Mon Sep 17 00:00:00 2001 From: Kyle Lahnakoski Date: Wed, 31 Jul 2024 20:07:47 -0400 Subject: [PATCH 5/5] update version number --- packaging/setup.py | 2 +- packaging/setuptools.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/setup.py b/packaging/setup.py index e2aac6f..1ff2ed8 100644 --- a/packaging/setup.py +++ b/packaging/setup.py @@ -15,6 +15,6 @@ name='mo-sql-parsing', packages=["mo_sql_parsing"], url='https://github.com/klahnakoski/mo-sql-parsing', - version='10.651.24172', + version='10.652.24214', zip_safe=True ) \ No newline at end of file diff --git a/packaging/setuptools.json b/packaging/setuptools.json index 183f249..1bb1afb 100644 --- a/packaging/setuptools.json +++ b/packaging/setuptools.json @@ -312,6 +312,6 @@ "name": "mo-sql-parsing", "packages": ["mo_sql_parsing"], "url": "https://github.com/klahnakoski/mo-sql-parsing", - "version": "10.651.24172", + "version": "10.652.24214", "zip_safe": true } \ No newline at end of file