From bf003762f2ce9cefd09746dd336693a84c18ccd4 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 9 May 2024 14:42:18 +0300
Subject: [PATCH 01/12] Prune sparkConnect SQL stack trace on AnalysisException

---
 src/sql/run/sparkdataframe.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index 81644b1e2..2781152a5 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -1,9 +1,11 @@
 try:
     from pyspark.sql import DataFrame
     from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
+    from pyspark.sql.utils import AnalysisException
 except ModuleNotFoundError:
     DataFrame = None
     CDataFrame = None
+    AnalysisException = None
 
 from sql import exceptions
 
@@ -13,8 +15,13 @@ def handle_spark_dataframe(dataframe, should_cache=False):
     if not DataFrame and not CDataFrame:
         raise exceptions.MissingPackageError("pysark not installed")
 
-    return SparkResultProxy(dataframe, dataframe.columns, should_cache)
-
+    try:
+        return SparkResultProxy(dataframe, dataframe.columns, should_cache)
+    except AnalysisException as e:
+        print(e)
+    except Exception as e:
+        print(e)
+        raise (e)
 
 class SparkResultProxy(object):
     """A fake class that pretends to behave like the ResultProxy from

From 1143d25f3d9fd5c52f6802554923527746494895 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 9 May 2024 15:50:21 +0300
Subject: [PATCH 02/12] Add AnalysisException on MissingPackageError check

---
 src/sql/run/sparkdataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index 2781152a5..569008b0b 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -12,7 +12,7 @@ def handle_spark_dataframe(dataframe, should_cache=False):
     """Execute a ResultSet sqlaproxy using pysark module."""
-    if not DataFrame and not CDataFrame:
+    if not DataFrame and not CDataFrame and not AnalysisException:
         raise exceptions.MissingPackageError("pysark not installed")

From d7e0e8f246c953bbf3568f6e008c2fff8ea3026d Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Fri, 10 May 2024 13:24:56 +0300
Subject: [PATCH 03/12] Lint

---
 src/sql/run/sparkdataframe.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index 569008b0b..a73d2e69f 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -21,7 +21,8 @@ def handle_spark_dataframe(dataframe, should_cache=False):
         print(e)
     except Exception as e:
         print(e)
-        raise (e)
+        raise (e)
+
 
 class SparkResultProxy(object):
     """A fake class that pretends to behave like the ResultProxy from
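
The snippet below is an illustrative sketch only, not part of the patch series: it shows the behaviour patches 01-03 aim for, with FakeAnalysisException standing in for pyspark.sql.utils.AnalysisException so it runs without pyspark installed. The analysis error is reduced to its message instead of surfacing the full Spark Connect stack trace.

    # Stand-in for pyspark.sql.utils.AnalysisException; lets the sketch run without pyspark.
    class FakeAnalysisException(Exception):
        pass


    def build_result_proxy():
        # Mimics handle_spark_dataframe() failing while the dataframe is evaluated.
        raise FakeAnalysisException(
            "[TABLE_OR_VIEW_NOT_FOUND] The table or view `t` cannot be found."
        )


    try:
        build_result_proxy()
    except FakeAnalysisException as e:
        print(e)  # only the short analysis message is shown, no full stack trace
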
From 11bedfa8662850f97168ebff72532219f4aad530 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 23 May 2024 10:18:25 +0300
Subject: [PATCH 04/12] Revert changes

---
 src/sql/run/sparkdataframe.py | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index a73d2e69f..762c69cb1 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -1,36 +1,26 @@
-try:
-    from pyspark.sql import DataFrame
-    from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
-    from pyspark.sql.utils import AnalysisException
-except ModuleNotFoundError:
-    DataFrame = None
-    CDataFrame = None
-    AnalysisException = None
+# try:
+from pyspark.sql import DataFrame
+from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
+# except ModuleNotFoundError:
+# DataFrame = None
+# CDataFrame = None
 
 from sql import exceptions
 
 
 def handle_spark_dataframe(dataframe, should_cache=False):
     """Execute a ResultSet sqlaproxy using pysark module."""
-    if not DataFrame and not CDataFrame and not AnalysisException:
+    if not DataFrame and not CDataFrame:
         raise exceptions.MissingPackageError("pysark not installed")
-    try:
-        return SparkResultProxy(dataframe, dataframe.columns, should_cache)
-    except AnalysisException as e:
-        print(e)
-    except Exception as e:
-        print(e)
-        raise (e)
+    return SparkResultProxy(dataframe, dataframe.columns, should_cache)
 class SparkResultProxy(object):
     """A fake class that pretends to behave like the ResultProxy from
     SqlAlchemy.
     """
-    dataframe = None
-
     def __init__(self, dataframe, headers, should_cache):
         self.dataframe = dataframe
         self.fetchall = dataframe.collect
@@ -40,21 +30,14 @@ def __init__(self, dataframe, headers, should_cache):
         self.returns_rows = True
         if should_cache:
             self.dataframe.cache()
-
     def fetchmany(self, size):
         return self.dataframe.take(size)
-
     def fetchone(self):
         return self.dataframe.head()
-
     def close(self):
         self.dataframe.unpersist()
-
-
 class SparkCursor(object):
     """Class to extend to give SqlAlchemy Cursor like behaviour"""
-
-    description = None
-
     def __init__(self, headers) -> None:
         self.description = headers

From 7578d8c9e2cd7a83dd47fa695fcc0681854a30a5 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 23 May 2024 10:18:44 +0300
Subject: [PATCH 05/12] Fix typo

---
 src/sql/run/sparkdataframe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index 762c69cb1..e47c0d318 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -9,9 +9,9 @@ def handle_spark_dataframe(dataframe, should_cache=False):
-    """Execute a ResultSet sqlaproxy using pysark module."""
+    """Execute a ResultSet sqlaproxy using pyspark module."""
     if not DataFrame and not CDataFrame:
-        raise exceptions.MissingPackageError("pysark not installed")
+        raise exceptions.MissingPackageError("pyspark not installed")
     return SparkResultProxy(dataframe, dataframe.columns, should_cache)

From 7102f97782f20ec347247d7c6b71bf88c0776bb2 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 23 May 2024 10:21:08 +0300
Subject: [PATCH 06/12] Add AnalysisException to is_non_sqlalchemy_error

---
 src/sql/util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/sql/util.py b/src/sql/util.py
index 74af08c32..59bac5e50 100644
--- a/src/sql/util.py
+++ b/src/sql/util.py
@@ -559,6 +559,7 @@ def is_non_sqlalchemy_error(error):
         # Pyspark
         "UNRESOLVED_ROUTINE",
         "PARSE_SYNTAX_ERROR",
+        "AnalysisException",
     ]
     return any(msg in str(error) for msg in specific_db_errors)

From 48c16234d694316d6146af6bf14fea24db179d19 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 23 May 2024 10:33:58 +0300
Subject: [PATCH 07/12] Lint

---
 src/sql/run/sparkdataframe.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index e47c0d318..e55d76f0a 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -1,26 +1,27 @@
-# try:
-from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
-# except ModuleNotFoundError:
-# DataFrame = None
-# CDataFrame = None
+try:
+    from pyspark.sql import DataFrame
+    from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
+except ModuleNotFoundError:
+    DataFrame = None
+    CDataFrame = None
 
 from sql import exceptions
 
 
 def handle_spark_dataframe(dataframe, should_cache=False):
-    """Execute a ResultSet sqlaproxy using pyspark module."""
+    """Execute a ResultSet sqlaproxy using pysark module."""
     if not DataFrame and not CDataFrame:
-        raise exceptions.MissingPackageError("pyspark not installed")
+        raise exceptions.MissingPackageError("pysark not installed")
     return SparkResultProxy(dataframe, dataframe.columns, should_cache)
+
 class SparkResultProxy(object):
     """A fake class that pretends to behave like the ResultProxy from
     SqlAlchemy.
     """
+    dataframe = None
+
     def __init__(self, dataframe, headers, should_cache):
         self.dataframe = dataframe
         self.fetchall = dataframe.collect
@@ -30,14 +31,21 @@ def __init__(self, dataframe, headers, should_cache):
         self.returns_rows = True
         if should_cache:
             self.dataframe.cache()
+
     def fetchmany(self, size):
         return self.dataframe.take(size)
+
     def fetchone(self):
         return self.dataframe.head()
+
     def close(self):
         self.dataframe.unpersist()
+
+
 class SparkCursor(object):
     """Class to extend to give SqlAlchemy Cursor like behaviour"""
+
+    description = None
+
     def __init__(self, headers) -> None:
         self.description = headers
From d951b42c7c92963b6ca394a7036a0bb74627cc7c Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Thu, 23 May 2024 10:35:05 +0300
Subject: [PATCH 08/12] Fix typo

---
 src/sql/run/sparkdataframe.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/sql/run/sparkdataframe.py b/src/sql/run/sparkdataframe.py
index e55d76f0a..995193776 100644
--- a/src/sql/run/sparkdataframe.py
+++ b/src/sql/run/sparkdataframe.py
@@ -9,12 +9,13 @@ def handle_spark_dataframe(dataframe, should_cache=False):
-    """Execute a ResultSet sqlaproxy using pysark module."""
+    """Execute a ResultSet sqlaproxy using pyspark module."""
     if not DataFrame and not CDataFrame:
-        raise exceptions.MissingPackageError("pysark not installed")
+        raise exceptions.MissingPackageError("pyspark not installed")
     return SparkResultProxy(dataframe, dataframe.columns, should_cache)
 
+
 class SparkResultProxy(object):
     """A fake class that pretends to behave like the ResultProxy from
     SqlAlchemy.
From e95d599aa729f74157c5193fe20c5f638dc96be1 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Mon, 27 May 2024 12:18:50 +0300
Subject: [PATCH 09/12] Update is_non_sqlalchemy_error to support pysparks AnalysisException

---
 src/sql/util.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/sql/util.py b/src/sql/util.py
index 59bac5e50..4eadcf1f8 100644
--- a/src/sql/util.py
+++ b/src/sql/util.py
@@ -7,6 +7,12 @@
 from sqlglot.errors import ParseError
 from sqlalchemy.exc import SQLAlchemyError
 from ploomber_core.dependencies import requires
+
+try:
+    from pyspark.sql.utils import AnalysisException
+except ModuleNotFoundError:
+    AnalysisException = None
+
 import ast
 from os.path import isfile
 import re
@@ -556,12 +562,14 @@ def is_non_sqlalchemy_error(error):
         "pyodbc.ProgrammingError",
         # Clickhouse errors
         "DB::Exception:",
-        # Pyspark
-        "UNRESOLVED_ROUTINE",
-        "PARSE_SYNTAX_ERROR",
-        "AnalysisException",
     ]
-    return any(msg in str(error) for msg in specific_db_errors)
+    is_pyspark_analysis_exception = (
+        isinstance(error, AnalysisException) if AnalysisException else False
+    )
+    return (
+        any(msg in str(error) for msg in specific_db_errors)
+        or is_pyspark_analysis_exception
+    )

From 1dcfb4f0c913fd07b7b9f33ea06aaeac937af62b Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Mon, 8 Jul 2024 17:55:52 +0300
Subject: [PATCH 10/12] Add Changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c231fd6f8..8461ba1b6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## 0.10.11dev
 
+* [Feature] Disable full stackt race when using spark connect ([#1011](https://github.com/ploomber/jupysql/issues/1011)) (by [@b1ackout](https://github.com/b1ackout))
+
 ## 0.10.10 (2024-02-07)
 
 * [Feature] Adds `ploomber-extension` as a dependency

From 4d2e8c48fa8e2f4e53bf73b570e620eba03d6ee7 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Mon, 2 Sep 2024 13:41:48 +0300
Subject: [PATCH 11/12] Fix typo

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8461ba1b6..c230fbe8b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 ## 0.10.11dev
 
-* [Feature] Disable full stackt race when using spark connect ([#1011](https://github.com/ploomber/jupysql/issues/1011)) (by [@b1ackout](https://github.com/b1ackout))
+* [Feature] Disable full stack trace when using spark connect ([#1011](https://github.com/ploomber/jupysql/issues/1011)) (by [@b1ackout](https://github.com/b1ackout))
 
 ## 0.10.10 (2024-02-07)
From 73fc1d14e488aea2c7189cd5b447d91d22217053 Mon Sep 17 00:00:00 2001
From: Athanasios Keramas
Date: Mon, 2 Sep 2024 13:53:51 +0300
Subject: [PATCH 12/12] Update changelog

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e6a4b950..71897e8ec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## 0.10.13dev
 
+* [Feature] Disable full stack trace when using spark connect ([#1011](https://github.com/ploomber/jupysql/issues/1011)) (by [@b1ackout](https://github.com/b1ackout))
+
 ## 0.10.12 (2024-07-12)
 
 * [Feature] Remove sqlalchemy upper bound ([#1020](https://github.com/ploomber/jupysql/pull/1020))
@@ -10,8 +12,6 @@
 
 * [Fix] Fix error when connections.ini contains a `query` value as dictionary ([#1015](https://github.com/ploomber/jupysql/issues/1015))
 
-* [Feature] Disable full stack trace when using spark connect ([#1011](https://github.com/ploomber/jupysql/issues/1011)) (by [@b1ackout](https://github.com/b1ackout))
-
 ## 0.10.10 (2024-02-07)
 
 * [Feature] Adds `ploomber-extension` as a dependency
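
A self-contained sketch of the check that patch 09 settles on, under the assumption that pyspark is not importable here (FakeAnalysisException plays the role of pyspark.sql.utils.AnalysisException; with pyspark installed the real class would be imported instead). Matching on the exception type rather than on the literal string "AnalysisException" also covers error messages that never mention the class name.

    class FakeAnalysisException(Exception):
        """Stand-in for pyspark.sql.utils.AnalysisException."""


    AnalysisException = FakeAnalysisException


    def is_non_sqlalchemy_error(error):
        # Same shape as the final src/sql/util.py check: substring markers for
        # other drivers, isinstance for pyspark's AnalysisException.
        specific_db_errors = [
            "pyodbc.ProgrammingError",
            "DB::Exception:",
        ]
        is_pyspark_analysis_exception = (
            isinstance(error, AnalysisException) if AnalysisException else False
        )
        return (
            any(msg in str(error) for msg in specific_db_errors)
            or is_pyspark_analysis_exception
        )


    print(is_non_sqlalchemy_error(FakeAnalysisException("TABLE_OR_VIEW_NOT_FOUND")))  # True
    print(is_non_sqlalchemy_error(ValueError("unrelated error")))  # False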