Fixes #123 #124

Merged: 7 commits, Sep 17, 2024
7 changes: 4 additions & 3 deletions .github/workflows/spark-tests.yaml
@@ -26,7 +26,7 @@ jobs:
PYSPARK_VERSION: ${{ matrix.config.pyspark }}
DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DATABRICKS_HOST: "https://rstudio-partner-posit-default.cloud.databricks.com"
DATABRICKS_CLUSTER_ID: "1026-175310-7cpsh3g8"
DATABRICKS_CLUSTER_ID: "0916-215603-ofitqny9"

steps:
- uses: actions/checkout@v3
@@ -43,10 +43,10 @@ jobs:
any::arrow
needs: check

- - name: Set up Python 3.10
+ - name: Set up Python 3.11
uses: actions/setup-python@v4
with:
- python-version: '3.10'
+ python-version: '3.11'

- name: Install Venv
run: |
@@ -62,6 +62,7 @@ jobs:
- name: Install Spark (via sparklyr)
if: steps.cache-spark.outputs.cache-hit != 'true'
run: |
+ devtools::install_github("sparklyr/sparklyr")
sparklyr::spark_install(version = Sys.getenv("SPARK_VERSION"))
print(sparklyr::spark_install_find(Sys.getenv("SPARK_VERSION"))$sparkVersionDir)
shell: Rscript {0}
7 changes: 4 additions & 3 deletions .github/workflows/test-coverage.yaml
@@ -14,15 +14,15 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DATABRICKS_HOST: "https://rstudio-partner-posit-default.cloud.databricks.com"
DATABRICKS_CLUSTER_ID: "1026-175310-7cpsh3g8"
DATABRICKS_CLUSTER_ID: "0916-215603-ofitqny9"

steps:
- uses: actions/checkout@v3

- - name: Set up Python 3.10
+ - name: Set up Python 3.11
uses: actions/setup-python@v4
with:
- python-version: '3.10'
+ python-version: '3.11'

- uses: r-lib/actions/setup-r@v2
with:
@@ -46,6 +46,7 @@ jobs:
- name: Install Spark (via sparklyr)
if: steps.cache-spark.outputs.cache-2-hit != 'true'
run: |
+ devtools::install_github("sparklyr/sparklyr")
sparklyr::spark_install(version = "3.5.0")
shell: Rscript {0}

2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: pysparklyr
Title: Provides a 'PySpark' Back-End for the 'sparklyr' Package
- Version: 0.1.5.9000
+ Version: 0.1.5.9001
Authors@R: c(
person("Edgar", "Ruiz", , "[email protected]", role = c("aut", "cre")),
person(given = "Posit Software, PBC", role = c("cph", "fnd"))
5 changes: 5 additions & 0 deletions NEWS.md
@@ -2,6 +2,11 @@

* Adds IDE check for positron (#121)

+ * Avoids failure when an unexpected error from Databricks is returned (#123)
+
+ * No longer installs 'rpy2' by default. The user is prompted to install it
+ the first time `spark_apply()` is called (#125)
+
# pysparklyr 0.1.5

### Improvements
15 changes: 10 additions & 5 deletions R/databricks-utils.R
@@ -206,12 +206,17 @@ databricks_dbr_error <- function(error) {
}

status_tip <- NULL
if (grepl("UNAVAILABLE", status_error)) {
status_tip <- "Possible cause = The cluster is not running, or not accessible"
}
if (grepl("FAILED_PRECONDITION", status_error)) {
status_tip <- "Possible cause = The cluster is initializing. Try again later"
+ if(!is.null(status_error)){
+ if (grepl("UNAVAILABLE", status_error)) {
+ status_tip <- "Possible cause = The cluster is not running, or not accessible"
+ }
+ if (grepl("FAILED_PRECONDITION", status_error)) {
+ status_tip <- "Possible cause = The cluster is initializing. Try again later"
+ }
+ } else {
+ status_error <- error
+ }

rlang::abort(
c(
"Spark connection error",
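The gist of this change: the `grepl()` checks are now guarded by `!is.null(status_error)`, and when no gRPC status could be extracted the raw error is used instead, so an unexpected Databricks failure still produces the "Spark connection error" abort rather than a secondary error. A minimal standalone sketch of the guarded pattern, assuming a hypothetical `parse_status()` stand-in for the package's status-extraction step (not its real internals):

handle_dbr_error <- function(error, parse_status = function(e) NULL) {
  # parse_status() is a hypothetical stand-in for whatever pulls the gRPC
  # status text out of the raw error message.
  status_error <- parse_status(error)
  status_tip <- NULL
  if (!is.null(status_error)) {
    if (grepl("UNAVAILABLE", status_error)) {
      status_tip <- "Possible cause = The cluster is not running, or not accessible"
    }
    if (grepl("FAILED_PRECONDITION", status_error)) {
      status_tip <- "Possible cause = The cluster is initializing. Try again later"
    }
  } else {
    # Fall back to the raw error so the abort still carries some detail.
    status_error <- error
  }
  rlang::abort(c("Spark connection error", status_tip, status_error))
}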
3 changes: 1 addition & 2 deletions R/python-install.R
@@ -217,8 +217,7 @@ install_environment <- function(
"PyArrow",
"grpcio",
"google-api-python-client",
"grpcio_status",
"rpy2"
"grpcio_status"
)

if (add_torch && install_ml) {
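With 'rpy2' dropped from this default list, environments created by the package's install helpers no longer ship it up front. If it is wanted ahead of time (for example on a CI runner), it can be added manually with reticulate; a minimal sketch, where the environment name is only a placeholder:

# Pre-install 'rpy2' into an existing virtualenv; "r-sparklyr-venv" is a
# placeholder name, not one the package creates.
reticulate::py_install("rpy2", envname = "r-sparklyr-venv")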
13 changes: 9 additions & 4 deletions R/sparklyr-spark-apply.R
@@ -14,10 +14,7 @@ spark_apply.tbl_pyspark <- function(
arrow_max_records_per_batch = NULL,
auto_deps = FALSE,
...) {
- py_check_installed(
- libraries = "rpy2",
- msg = "Requires an additional Python library"
- )
+ rpy2_installed()
cli_div(theme = cli_colors())
if (!is.null(packages)) {
cli_abort("`packages` is not yet supported for this backend")
@@ -209,3 +206,11 @@ sa_function_to_string <- function(
}
ret
}
+
+ rpy2_installed <- function(envname = NULL) {
+ py_check_installed(
+ envname = envname,
+ libraries = "rpy2",
+ msg = "Required 'rpy2' Python library is missing"
+ )
+ }
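Taken together with the NEWS entry above, the first `spark_apply()` call in a session without 'rpy2' now routes through `rpy2_installed()` and `py_check_installed()`, which prompts to install the library instead of assuming it was bundled when the environment was created. A rough usage sketch, assuming DATABRICKS_HOST, DATABRICKS_TOKEN, and DATABRICKS_CLUSTER_ID are set in the environment as in the workflow files above:

library(sparklyr)

# Connect through the Databricks Connect backend; connection details are
# assumed to come from the DATABRICKS_* environment variables.
sc <- spark_connect(method = "databricks_connect")

tbl_mtcars <- copy_to(sc, mtcars)

# If 'rpy2' is missing from the Python environment, this call now prompts
# for its installation rather than failing outright.
spark_apply(tbl_mtcars, nrow, group_by = "am", columns = "am double, x long")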
12 changes: 12 additions & 0 deletions tests/testthat/_snaps/databricks-utils.md
@@ -0,0 +1,12 @@
# DBR error code returns as expected

Spark connection error
* Possible cause = The cluster is not running, or not accessible
* status = StatusCode.UNAVAILABLE
* details = 'RESOURCE_DOES_NOT_EXIST: No cluster found matching: asdfasdf'

---

Spark connection error
*

89 changes: 5 additions & 84 deletions tests/testthat/_snaps/python-install.md
@@ -4,11 +4,11 @@
x
Output
$packages
[1] "pyspark==3.5.0" "pandas!=2.1.0"
[3] "PyArrow" "grpcio"
[5] "google-api-python-client" "grpcio_status"
[7] "rpy2" "torch"
[9] "torcheval" "scikit-learn"
[1] "pyspark==3.5.0" "pandas!=2.1.0"
[3] "PyArrow" "grpcio"
[5] "google-api-python-client" "grpcio_status"
[7] "torch" "torcheval"
[9] "scikit-learn"

$envname
unavailable
@@ -37,7 +37,6 @@
[1] "pyspark==3.5.*" "pandas!=2.1.0"
[3] "PyArrow" "grpcio"
[5] "google-api-python-client" "grpcio_status"
[7] "rpy2"

$envname
unavailable
@@ -57,81 +56,3 @@
Output
[1] "pysparklyr:::install_environment(a = 1)"

- # Databricks installation works
-
- Code
- out
- Output
- $main_library
- [1] "databricks-connect"
-
- $spark_method
- [1] "databricks_connect"
-
- $backend
- [1] "databricks"
-
- $ml_version
- [1] "14.1"
-
- $version
- [1] "14.1"
-
- $envname
- NULL
-
- $python_version
- NULL
-
- $new_env
- [1] TRUE
-
- $method
- [1] "auto" "virtualenv" "conda"
-
- $as_job
- [1] TRUE
-
- $install_ml
- [1] FALSE
-
-
- ---
-
- Code
- install_databricks(version = "13.1")
- Output
- $main_library
- [1] "databricks-connect"
-
- $spark_method
- [1] "databricks_connect"
-
- $backend
- [1] "databricks"
-
- $ml_version
- [1] "14.1"
-
- $version
- [1] "13.1"
-
- $envname
- NULL
-
- $python_version
- NULL
-
- $new_env
- [1] TRUE
-
- $method
- [1] "auto" "virtualenv" "conda"
-
- $as_job
- [1] TRUE
-
- $install_ml
- [1] FALSE
-
-
8 changes: 5 additions & 3 deletions tests/testthat/test-databricks-utils.R
@@ -1,5 +1,3 @@
- skip_if_not_databricks()
-
test_that("DBR error code returns as expected", {
error <- paste0(
"SparkConnectGrpcException('<_InactiveRpcError of RPC that terminated with:",
@@ -10,9 +8,13 @@ test_that("DBR error code returns as expected", {
" created_time:'2023-10-02T12:14:52.379226-05:00'}'\n>')"
)

- expect_error(databricks_dbr_error(error))
+ expect_snapshot_error(databricks_dbr_error(error))
+
+ expect_snapshot_error(databricks_dbr_error(""))
})

+ skip_if_not_databricks()
+
test_that("Databricks Host works", {
expect_true(nchar(databricks_host()) > 5)

1 change: 1 addition & 0 deletions tests/testthat/test-sparklyr-spark-apply.R
@@ -1,4 +1,5 @@
test_that("spark_apply() works", {
py_install("rpy2")
tbl_mtcars <- use_test_table_mtcars()
expect_s3_class(
spark_apply(tbl_mtcars, nrow, group_by = "am", columns = "am double, x long"),