From b6ddd85533a842b23bcc0201636059fcc08bc127 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Wed, 4 Sep 2024 10:46:20 -0500
Subject: [PATCH 1/7] Fixes #123

---
 R/databricks-utils.R                      | 15 ++++++++++-----
 tests/testthat/_snaps/databricks-utils.md | 12 ++++++++++++
 tests/testthat/test-databricks-utils.R    |  8 +++++---
 3 files changed, 27 insertions(+), 8 deletions(-)
 create mode 100644 tests/testthat/_snaps/databricks-utils.md

diff --git a/R/databricks-utils.R b/R/databricks-utils.R
index d87999f..57eb233 100644
--- a/R/databricks-utils.R
+++ b/R/databricks-utils.R
@@ -206,12 +206,17 @@ databricks_dbr_error <- function(error) {
   }
 
   status_tip <- NULL
-  if (grepl("UNAVAILABLE", status_error)) {
-    status_tip <- "Possible cause = The cluster is not running, or not accessible"
-  }
-  if (grepl("FAILED_PRECONDITION", status_error)) {
-    status_tip <- "Possible cause = The cluster is initializing. Try again later"
+  if(!is.null(status_error)){
+    if (grepl("UNAVAILABLE", status_error)) {
+      status_tip <- "Possible cause = The cluster is not running, or not accessible"
+    }
+    if (grepl("FAILED_PRECONDITION", status_error)) {
+      status_tip <- "Possible cause = The cluster is initializing. Try again later"
+    }
+  } else {
+    status_error <- error
   }
+
   rlang::abort(
     c(
       "Spark connection error",
diff --git a/tests/testthat/_snaps/databricks-utils.md b/tests/testthat/_snaps/databricks-utils.md
new file mode 100644
index 0000000..d596d35
--- /dev/null
+++ b/tests/testthat/_snaps/databricks-utils.md
@@ -0,0 +1,12 @@
+# DBR error code returns as expected
+
+    Spark connection error
+    * Possible cause = The cluster is not running, or not accessible
+    * status = StatusCode.UNAVAILABLE
+    * details = 'RESOURCE_DOES_NOT_EXIST: No cluster found matching: asdfasdf'
+
+---
+
+    Spark connection error
+    *
+
diff --git a/tests/testthat/test-databricks-utils.R b/tests/testthat/test-databricks-utils.R
index f86c0bb..8b5e557 100644
--- a/tests/testthat/test-databricks-utils.R
+++ b/tests/testthat/test-databricks-utils.R
@@ -1,5 +1,3 @@
-skip_if_not_databricks()
-
 test_that("DBR error code returns as expected", {
   error <- paste0(
     "SparkConnectGrpcException('<_InactiveRpcError of RPC that terminated with:",
@@ -10,8 +8,12 @@
     " created_time:'2023-10-02T12:14:52.379226-05:00'}'\n>')"
   )
 
-  expect_error(databricks_dbr_error(error))
+  expect_snapshot_error(databricks_dbr_error(error))
+
+  expect_snapshot_error(databricks_dbr_error(""))
 })
 
+skip_if_not_databricks()
+
 test_that("Databricks Host works", {
   expect_true(nchar(databricks_host()) > 5)
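Why patch 1 needs the `is.null()` guard: when no status line can be parsed out of the raw Databricks message, `status_error` ends up `NULL`, `grepl()` on a `NULL` returns a zero-length logical, and `if` then stops with "argument is of length zero". A minimal sketch of the failure mode and the fix (illustration only; `status_error` and `error` stand in for the values inside `databricks_dbr_error()`):

```r
error <- "some unexpected Databricks error"  # the full raw message
status_error <- NULL                         # no status line could be parsed

grepl("UNAVAILABLE", status_error)
#> logical(0)

# `if (grepl("UNAVAILABLE", status_error))` therefore errors with
# "argument is of length zero" -- the crash reported in #123.
# The guard falls back to reporting the full raw message instead:
if (!is.null(status_error)) {
  if (grepl("UNAVAILABLE", status_error)) {
    status_tip <- "Possible cause = The cluster is not running, or not accessible"
  }
} else {
  status_error <- error
}
```

The second snapshot in the new test file covers exactly this path: `databricks_dbr_error("")` now yields a plain "Spark connection error" instead of crashing.
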
From d92f8ccb8239cc282d2bbc7740bef678bb61bac5 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Wed, 4 Sep 2024 10:48:26 -0500
Subject: [PATCH 2/7] Adds NEWS item, ver bump

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 6fae540..7fd70ed 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pysparklyr
 Title: Provides a 'PySpark' Back-End for the 'sparklyr' Package
-Version: 0.1.5.9000
+Version: 0.1.5.9001
 Authors@R: c(
     person("Edgar", "Ruiz", , "edgar@posit.co", role = c("aut", "cre")),
     person(given = "Posit Software, PBC", role = c("cph", "fnd"))
diff --git a/NEWS.md b/NEWS.md
index 325fa46..06bf816 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,8 @@
 
 * Adds IDE check for positron (#121)
 
+* Avoids failure when an unexpected error from Databricks is returned (#123)
+
 # pysparklyr 0.1.5
 
 ### Improvements

From 21c9ca00ff3c76cd00a590ba3015c55a16a384be Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Mon, 16 Sep 2024 16:14:28 -0500
Subject: [PATCH 3/7] Fixes #125

---
 R/python-install.R       |  3 +--
 R/sparklyr-spark-apply.R | 13 +++++++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/R/python-install.R b/R/python-install.R
index 3a4e00e..0fafe49 100644
--- a/R/python-install.R
+++ b/R/python-install.R
@@ -217,8 +217,7 @@ install_environment <- function(
     "PyArrow",
     "grpcio",
     "google-api-python-client",
-    "grpcio_status",
-    "rpy2"
+    "grpcio_status"
   )
 
   if (add_torch && install_ml) {
diff --git a/R/sparklyr-spark-apply.R b/R/sparklyr-spark-apply.R
index afdbdb9..afc7cdc 100644
--- a/R/sparklyr-spark-apply.R
+++ b/R/sparklyr-spark-apply.R
@@ -14,10 +14,7 @@ spark_apply.tbl_pyspark <- function(
     arrow_max_records_per_batch = NULL,
     auto_deps = FALSE,
     ...) {
-  py_check_installed(
-    libraries = "rpy2",
-    msg = "Requires an additional Python library"
-  )
+  rpy2_installed()
   cli_div(theme = cli_colors())
   if (!is.null(packages)) {
     cli_abort("`packages` is not yet supported for this backend")
@@ -209,3 +206,11 @@ sa_function_to_string <- function(
   }
   ret
 }
+
+rpy2_installed <- function(envname = NULL) {
+  py_check_installed(
+    envname = envname,
+    libraries = "rpy2",
+    msg = "Required 'rpy2' Python library is missing"
+  )
+}
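Patch 3 swaps an eager dependency for a lazy one: 'rpy2' is dropped from the default package list that `install_environment()` provisions, and `spark_apply()` now checks for it only at the moment it is actually needed, via the new `rpy2_installed()` wrapper around the package's internal `py_check_installed()` helper. A rough public-API equivalent of that lazy check, written with reticulate (the `ensure_rpy2()` name and prompt wording are illustrative, not pysparklyr's actual code, which handles messaging via cli):

```r
library(reticulate)

ensure_rpy2 <- function(envname = NULL) {
  # Cheap check first: is the Python module importable at all?
  if (py_module_available("rpy2")) {
    return(invisible(TRUE))
  }
  # Only touch the Python environment with the user's consent.
  prompt <- "The 'rpy2' Python library is missing. Install it now?"
  if (interactive() && isTRUE(utils::askYesNo(prompt))) {
    py_install("rpy2", envname = envname)
  } else {
    stop("Required 'rpy2' Python library is missing", call. = FALSE)
  }
  invisible(TRUE)
}
```

Deferring the check keeps freshly created environments smaller, and users who never call `spark_apply()` never pay for rpy2 at all.
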
From 1bfcb556731a8fa0eb796e1e043bb927ebfba486 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Tue, 17 Sep 2024 07:48:24 -0500
Subject: [PATCH 4/7] Temporarily switches the CI jobs to dev sparklyr

---
 .github/workflows/spark-tests.yaml   | 1 +
 .github/workflows/test-coverage.yaml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/spark-tests.yaml b/.github/workflows/spark-tests.yaml
index f0cd691..74b4447 100644
--- a/.github/workflows/spark-tests.yaml
+++ b/.github/workflows/spark-tests.yaml
@@ -62,6 +62,7 @@ jobs:
       - name: Install Spark (via sparklyr)
         if: steps.cache-spark.outputs.cache-hit != 'true'
         run: |
+          devtools::install_github("sparklyr/sparklyr")
           sparklyr::spark_install(version = Sys.getenv("SPARK_VERSION"))
           print(sparklyr::spark_install_find(Sys.getenv("SPARK_VERSION"))$sparkVersionDir)
         shell: Rscript {0}
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index d8f750c..89832b7 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -46,6 +46,7 @@ jobs:
       - name: Install Spark (via sparklyr)
         if: steps.cache-spark.outputs.cache-2-hit != 'true'
         run: |
+          devtools::install_github("sparklyr/sparklyr")
           sparklyr::spark_install(version = "3.5.0")
         shell: Rscript {0}
 

From ddc8232d6c0441c44d52c7efe927ce3cbebd301b Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Tue, 17 Sep 2024 08:19:16 -0500
Subject: [PATCH 5/7] Updates cluster id, and python version for Spark tests,
 disable the other jobs temporarily

---
 .github/workflows/R-CMD-check.yaml   | 2 +-
 .github/workflows/spark-tests.yaml   | 6 +++---
 .github/workflows/test-coverage.yaml | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index f2a275a..2e85aa7 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -4,7 +4,7 @@ on:
   push:
     branches: main
   pull_request:
-    branches: main
+    branches: temp
 
 name: R-CMD-check
 
diff --git a/.github/workflows/spark-tests.yaml b/.github/workflows/spark-tests.yaml
index 74b4447..c09e757 100644
--- a/.github/workflows/spark-tests.yaml
+++ b/.github/workflows/spark-tests.yaml
@@ -26,7 +26,7 @@ jobs:
       PYSPARK_VERSION: ${{ matrix.config.pyspark }}
       DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
       DATABRICKS_HOST: "https://rstudio-partner-posit-default.cloud.databricks.com"
-      DATABRICKS_CLUSTER_ID: "1026-175310-7cpsh3g8"
+      DATABRICKS_CLUSTER_ID: "0916-215603-ofitqny9"
 
     steps:
       - uses: actions/checkout@v3
@@ -43,10 +43,10 @@ jobs:
           any::arrow
         needs: check
 
-      - name: Set up Python 3.10
+      - name: Set up Python 3.11
         uses: actions/setup-python@v4
         with:
-          python-version: '3.10'
+          python-version: '3.11'
 
       - name: Install Venv
         run: |
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index 89832b7..fdf4610 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -2,7 +2,7 @@ on:
   push:
     branches: main
   pull_request:
-    branches: main
+    branches: temp
 
 name: test-coverage
 
@@ -14,15 +14,15 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
       DATABRICKS_HOST: "https://rstudio-partner-posit-default.cloud.databricks.com"
-      DATABRICKS_CLUSTER_ID: "1026-175310-7cpsh3g8"
+      DATABRICKS_CLUSTER_ID: "0916-215603-ofitqny9"
 
     steps:
       - uses: actions/checkout@v3
 
-      - name: Set up Python 3.10
+      - name: Set up Python 3.11
         uses: actions/setup-python@v4
         with:
-          python-version: '3.10'
+          python-version: '3.11'
 
       - uses: r-lib/actions/setup-r@v2
         with:
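Patches 4 and 5 are CI plumbing around the fix: the workflows temporarily install the development version of sparklyr before provisioning Spark, retarget the `pull_request` triggers at a `temp` branch so the other jobs stay parked, and move the Databricks tests to a new cluster and Python 3.11. Assembled, the patched install step amounts to running this R code under `Rscript` (the `SPARK_VERSION` value is supplied by the job matrix in CI; "3.5.0" below mirrors the version the coverage workflow pins):

```r
# What the patched "Install Spark (via sparklyr)" steps execute in CI.
devtools::install_github("sparklyr/sparklyr")  # dev sparklyr, temporarily
spark_version <- Sys.getenv("SPARK_VERSION", unset = "3.5.0")
sparklyr::spark_install(version = spark_version)
# Echo the install location so it shows up in the job log:
print(sparklyr::spark_install_find(spark_version)$sparkVersionDir)
```
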
From 2eae4a944f13febe8f589e227cd142ddfc6dcd04 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Tue, 17 Sep 2024 08:38:40 -0500
Subject: [PATCH 6/7] Adds rpy2 installation step, updates snapshot

---
 tests/testthat/_snaps/python-install.md    | 89 ++--------------------
 tests/testthat/test-sparklyr-spark-apply.R |  1 +
 2 files changed, 6 insertions(+), 84 deletions(-)

diff --git a/tests/testthat/_snaps/python-install.md b/tests/testthat/_snaps/python-install.md
index 7cad6c0..f0c9ad6 100644
--- a/tests/testthat/_snaps/python-install.md
+++ b/tests/testthat/_snaps/python-install.md
@@ -4,11 +4,11 @@
       x
     Output
       $packages
-      [1] "pyspark==3.5.0"           "pandas!=2.1.0"
-      [3] "PyArrow"                  "grpcio"
-      [5] "google-api-python-client" "grpcio_status"
-      [7] "rpy2"                     "torch"
-      [9] "torcheval"                "scikit-learn"
+      [1] "pyspark==3.5.0"           "pandas!=2.1.0"
+      [3] "PyArrow"                  "grpcio"
+      [5] "google-api-python-client" "grpcio_status"
+      [7] "torch"                    "torcheval"
+      [9] "scikit-learn"
      
       $envname
       unavailable
@@ -37,7 +37,6 @@
       [1] "pyspark==3.5.*"           "pandas!=2.1.0"
       [3] "PyArrow"                  "grpcio"
       [5] "google-api-python-client" "grpcio_status"
-      [7] "rpy2"
      
       $envname
       unavailable
@@ -57,81 +56,3 @@
     Output
       [1] "pysparklyr:::install_environment(a = 1)"
 
-# Databricks installation works
-
-    Code
-      out
-    Output
-      $main_library
-      [1] "databricks-connect"
-      
-      $spark_method
-      [1] "databricks_connect"
-      
-      $backend
-      [1] "databricks"
-      
-      $ml_version
-      [1] "14.1"
-      
-      $version
-      [1] "14.1"
-      
-      $envname
-      NULL
-      
-      $python_version
-      NULL
-      
-      $new_env
-      [1] TRUE
-      
-      $method
-      [1] "auto"       "virtualenv" "conda"
-      
-      $as_job
-      [1] TRUE
-      
-      $install_ml
-      [1] FALSE
-      
-
----
-
-    Code
-      install_databricks(version = "13.1")
-    Output
-      $main_library
-      [1] "databricks-connect"
-      
-      $spark_method
-      [1] "databricks_connect"
-      
-      $backend
-      [1] "databricks"
-      
-      $ml_version
-      [1] "14.1"
-      
-      $version
-      [1] "13.1"
-      
-      $envname
-      NULL
-      
-      $python_version
-      NULL
-      
-      $new_env
-      [1] TRUE
-      
-      $method
-      [1] "auto"       "virtualenv" "conda"
-      
-      $as_job
-      [1] TRUE
-      
-      $install_ml
-      [1] FALSE
-      
diff --git a/tests/testthat/test-sparklyr-spark-apply.R b/tests/testthat/test-sparklyr-spark-apply.R
index 509cb64..a48f52c 100644
--- a/tests/testthat/test-sparklyr-spark-apply.R
+++ b/tests/testthat/test-sparklyr-spark-apply.R
@@ -1,4 +1,5 @@
 test_that("spark_apply() works", {
+  py_install("rpy2")
   tbl_mtcars <- use_test_table_mtcars()
   expect_s3_class(
     spark_apply(tbl_mtcars, nrow, group_by = "am", columns = "am double, x long"),

From 13222f708bda90a187dd3c8ac3b0b9766cfcf6e9 Mon Sep 17 00:00:00 2001
From: Edgar Ruiz
Date: Tue, 17 Sep 2024 08:52:38 -0500
Subject: [PATCH 7/7] Restores the other tests, adds NEWS item

---
 .github/workflows/R-CMD-check.yaml   | 2 +-
 .github/workflows/test-coverage.yaml | 2 +-
 NEWS.md                              | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
index 2e85aa7..f2a275a 100644
--- a/.github/workflows/R-CMD-check.yaml
+++ b/.github/workflows/R-CMD-check.yaml
@@ -4,7 +4,7 @@ on:
   push:
     branches: main
   pull_request:
-    branches: temp
+    branches: main
 
 name: R-CMD-check
 
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
index fdf4610..5efae0c 100644
--- a/.github/workflows/test-coverage.yaml
+++ b/.github/workflows/test-coverage.yaml
@@ -2,7 +2,7 @@ on:
   push:
     branches: main
   pull_request:
-    branches: temp
+    branches: main
 
 name: test-coverage
 
diff --git a/NEWS.md b/NEWS.md
index 06bf816..9d20db3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,6 +4,9 @@
 
 * Avoids failure when an unexpected error from Databricks is returned (#123)
 
+* No longer install 'rpy2' by default. It will prompt user for installation
+the first time `spark_apply()` is called (#125)
+
 # pysparklyr 0.1.5
 
 ### Improvements
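Patch 6 updates the pieces the new behavior touches: the install snapshots no longer list 'rpy2' among the default packages, and the `spark_apply()` test provisions the library itself up front with `py_install("rpy2")` so it never hits the interactive prompt on CI. Patch 7 then restores the `pull_request` triggers and records the user-facing change in NEWS. Users who relied on rpy2 being preinstalled can do the same thing once per environment, along these lines (the environment name below is only an example; match it to the one your Spark connection actually uses):

```r
# Pre-install rpy2 so the first spark_apply() call never needs to prompt.
# "r-sparklyr" is a placeholder env name, not something pysparklyr mandates.
reticulate::py_install("rpy2", envname = "r-sparklyr")
```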