
Merge pull request #124 from mlverse/updates
A couple of fixes
edgararuiz authored Sep 17, 2024
2 parents b3b91ea + 13222f7 commit 6ea4aab
Showing 11 changed files with 57 additions and 105 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/spark-tests.yaml
@@ -26,7 +26,7 @@ jobs:
PYSPARK_VERSION: ${{ matrix.config.pyspark }}
DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DATABRICKS_HOST: "https://rstudio-partner-posit-default.cloud.databricks.com"
DATABRICKS_CLUSTER_ID: "1026-175310-7cpsh3g8"
DATABRICKS_CLUSTER_ID: "0916-215603-ofitqny9"

steps:
- uses: actions/checkout@v3
@@ -43,10 +43,10 @@ jobs:
any::arrow
needs: check

- - name: Set up Python 3.10
+ - name: Set up Python 3.11
uses: actions/setup-python@v4
with:
- python-version: '3.10'
+ python-version: '3.11'

- name: Install Venv
run: |
@@ -62,6 +62,7 @@ jobs:
- name: Install Spark (via sparklyr)
if: steps.cache-spark.outputs.cache-hit != 'true'
run: |
+ devtools::install_github("sparklyr/sparklyr")
sparklyr::spark_install(version = Sys.getenv("SPARK_VERSION"))
print(sparklyr::spark_install_find(Sys.getenv("SPARK_VERSION"))$sparkVersionDir)
shell: Rscript {0}
7 changes: 4 additions & 3 deletions .github/workflows/test-coverage.yaml
@@ -14,15 +14,15 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}
DATABRICKS_HOST: "https://rstudio-partner-posit-default.cloud.databricks.com"
DATABRICKS_CLUSTER_ID: "1026-175310-7cpsh3g8"
DATABRICKS_CLUSTER_ID: "0916-215603-ofitqny9"

steps:
- uses: actions/checkout@v3

- - name: Set up Python 3.10
+ - name: Set up Python 3.11
uses: actions/setup-python@v4
with:
- python-version: '3.10'
+ python-version: '3.11'

- uses: r-lib/actions/setup-r@v2
with:
@@ -46,6 +46,7 @@ jobs:
- name: Install Spark (via sparklyr)
if: steps.cache-spark.outputs.cache-2-hit != 'true'
run: |
+ devtools::install_github("sparklyr/sparklyr")
sparklyr::spark_install(version = "3.5.0")
shell: Rscript {0}

2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: pysparklyr
Title: Provides a 'PySpark' Back-End for the 'sparklyr' Package
- Version: 0.1.5.9000
+ Version: 0.1.5.9001
Authors@R: c(
person("Edgar", "Ruiz", , "[email protected]", role = c("aut", "cre")),
person(given = "Posit Software, PBC", role = c("cph", "fnd"))
5 changes: 5 additions & 0 deletions NEWS.md
@@ -2,6 +2,11 @@

* Adds IDE check for positron (#121)

+ * Avoids failure when an unexpected error from Databricks is returned (#123)
+
+ * No longer installs 'rpy2' by default. The user is prompted to install it
+   the first time `spark_apply()` is called (#125)
+
# pysparklyr 0.1.5

### Improvements
15 changes: 10 additions & 5 deletions R/databricks-utils.R
@@ -206,12 +206,17 @@ databricks_dbr_error <- function(error) {
}

status_tip <- NULL
if (grepl("UNAVAILABLE", status_error)) {
status_tip <- "Possible cause = The cluster is not running, or not accessible"
}
if (grepl("FAILED_PRECONDITION", status_error)) {
status_tip <- "Possible cause = The cluster is initializing. Try again later"
if(!is.null(status_error)){
if (grepl("UNAVAILABLE", status_error)) {
status_tip <- "Possible cause = The cluster is not running, or not accessible"
}
if (grepl("FAILED_PRECONDITION", status_error)) {
status_tip <- "Possible cause = The cluster is initializing. Try again later"
}
} else {
status_error <- error
}

rlang::abort(
c(
"Spark connection error",
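Note on the change above: a minimal, standalone R illustration (not code from this PR) of the failure mode the new is.null() guard avoids. Per NEWS (#123), an unexpected Databricks error can leave status_error as NULL; grepl() on NULL returns a zero-length logical, and `if` rejects a zero-length condition (output shown approximately):

status_error <- NULL
grepl("UNAVAILABLE", status_error)
#> logical(0)
if (grepl("UNAVAILABLE", status_error)) status_tip <- "..."
#> Error in if (...) : argument is of length zero

With the guard in place, the raw error string is used as status_error instead, which is what the new snapshot in tests/testthat/_snaps/databricks-utils.md below exercises.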
3 changes: 1 addition & 2 deletions R/python-install.R
@@ -217,8 +217,7 @@ install_environment <- function(
"PyArrow",
"grpcio",
"google-api-python-client",
"grpcio_status",
"rpy2"
"grpcio_status"
)

if (add_torch && install_ml) {
13 changes: 9 additions & 4 deletions R/sparklyr-spark-apply.R
@@ -14,10 +14,7 @@ spark_apply.tbl_pyspark <- function(
arrow_max_records_per_batch = NULL,
auto_deps = FALSE,
...) {
- py_check_installed(
-   libraries = "rpy2",
-   msg = "Requires an additional Python library"
- )
+ rpy2_installed()
cli_div(theme = cli_colors())
if (!is.null(packages)) {
cli_abort("`packages` is not yet supported for this backend")
@@ -209,3 +206,11 @@ sa_function_to_string <- function(
}
ret
}
+
+ rpy2_installed <- function(envname = NULL) {
+   py_check_installed(
+     envname = envname,
+     libraries = "rpy2",
+     msg = "Required 'rpy2' Python library is missing"
+   )
+ }
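For context, a hypothetical session sketch of the new behavior (not part of the PR; `sc` is an assumed existing Databricks/Spark Connect connection, and the spark_apply() call mirrors the updated test in tests/testthat/test-sparklyr-spark-apply.R below):

library(sparklyr)
tbl_mtcars <- dplyr::copy_to(sc, mtcars)

# spark_apply() now calls rpy2_installed() before doing any work. If rpy2 is
# missing from the Python environment, py_check_installed() reports
# "Required 'rpy2' Python library is missing" and prompts for installation
# (per NEWS #125), instead of rpy2 being installed up front by
# install_environment().
spark_apply(tbl_mtcars, nrow, group_by = "am", columns = "am double, x long")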
12 changes: 12 additions & 0 deletions tests/testthat/_snaps/databricks-utils.md
@@ -0,0 +1,12 @@
# DBR error code returns as expected

Spark connection error
* Possible cause = The cluster is not running, or not accessible
* status = StatusCode.UNAVAILABLE
* details = 'RESOURCE_DOES_NOT_EXIST: No cluster found matching: asdfasdf'

---

Spark connection error
*

89 changes: 5 additions & 84 deletions tests/testthat/_snaps/python-install.md
@@ -4,11 +4,11 @@
x
Output
$packages
[1] "pyspark==3.5.0" "pandas!=2.1.0"
[3] "PyArrow" "grpcio"
[5] "google-api-python-client" "grpcio_status"
[7] "rpy2" "torch"
[9] "torcheval" "scikit-learn"
[1] "pyspark==3.5.0" "pandas!=2.1.0"
[3] "PyArrow" "grpcio"
[5] "google-api-python-client" "grpcio_status"
[7] "torch" "torcheval"
[9] "scikit-learn"
$envname
unavailable
@@ -37,7 +37,6 @@
[1] "pyspark==3.5.*" "pandas!=2.1.0"
[3] "PyArrow" "grpcio"
[5] "google-api-python-client" "grpcio_status"
[7] "rpy2"
$envname
unavailable
@@ -57,81 +56,3 @@
Output
[1] "pysparklyr:::install_environment(a = 1)"

# Databricks installation works

Code
out
Output
$main_library
[1] "databricks-connect"
$spark_method
[1] "databricks_connect"
$backend
[1] "databricks"
$ml_version
[1] "14.1"
$version
[1] "14.1"
$envname
NULL
$python_version
NULL
$new_env
[1] TRUE
$method
[1] "auto" "virtualenv" "conda"
$as_job
[1] TRUE
$install_ml
[1] FALSE

---

Code
install_databricks(version = "13.1")
Output
$main_library
[1] "databricks-connect"
$spark_method
[1] "databricks_connect"
$backend
[1] "databricks"
$ml_version
[1] "14.1"
$version
[1] "13.1"
$envname
NULL
$python_version
NULL
$new_env
[1] TRUE
$method
[1] "auto" "virtualenv" "conda"
$as_job
[1] TRUE
$install_ml
[1] FALSE

8 changes: 5 additions & 3 deletions tests/testthat/test-databricks-utils.R
@@ -1,5 +1,3 @@
- skip_if_not_databricks()
-
test_that("DBR error code returns as expected", {
error <- paste0(
"SparkConnectGrpcException('<_InactiveRpcError of RPC that terminated with:",
@@ -10,9 +8,13 @@ test_that("DBR error code returns as expected", {
" created_time:'2023-10-02T12:14:52.379226-05:00'}'\n>')"
)

- expect_error(databricks_dbr_error(error))
+ expect_snapshot_error(databricks_dbr_error(error))
+
+ expect_snapshot_error(databricks_dbr_error(""))
})

+ skip_if_not_databricks()
+
test_that("Databricks Host works", {
expect_true(nchar(databricks_host()) > 5)

1 change: 1 addition & 0 deletions tests/testthat/test-sparklyr-spark-apply.R
@@ -1,4 +1,5 @@
test_that("spark_apply() works", {
py_install("rpy2")
tbl_mtcars <- use_test_table_mtcars()
expect_s3_class(
spark_apply(tbl_mtcars, nrow, group_by = "am", columns = "am double, x long"),
