Merge pull request #13 from gaborcsardi/upkeep/tests
Test round-trip w/ arrow, duckdb
gaborcsardi authored May 2, 2024
2 parents f7b417b + 0ffbe9b commit 5354d47
Showing 9 changed files with 219 additions and 3 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -50,11 +50,34 @@ jobs:
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - name: Install Python packages for testing
        run: |
          dir.create("~/.pip", showWarnings = FALSE, recursive = TRUE)
          writeLines(
            c("[global]", "break-system-packages = true", "user = true"),
            "~/.pip/pip.conf"
          )
          if (Sys.which("pip3") != "") {
            system("pip3 install pyarrow pandas")
          } else {
            system("pip install pyarrow pandas")
          }
        shell: Rscript {0}

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::rcmdcheck
          needs: check

      - name: Update arrow on macOS, CRAN binary is broken
        if: runner.os == 'macOS'
        run: |
          if (! arrow::arrow_info()$capabilities[["parquet"]]) {
            pak::repo_add(CRAN = "https://apache.r-universe.dev")
            pak::pkg_install("arrow?reinstall&nocache")
          }
        shell: Rscript {0}

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
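The macOS step above reinstalls arrow from r-universe when the CRAN binary lacks Parquet support. The same capability check can be run locally before trying the new round-trip tests; this is a minimal sketch, not part of the commit:

if (requireNamespace("arrow", quietly = TRUE)) {
  # arrow_info()$capabilities reports which optional features the installed
  # libarrow was built with, including "parquet".
  if (!isTRUE(arrow::arrow_info()$capabilities[["parquet"]])) {
    message("arrow was built without Parquet support; reinstall it before running the tests")
  }
}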
14 changes: 14 additions & 0 deletions .github/workflows/test-coverage.yaml
@@ -21,6 +21,20 @@ jobs:
        with:
          use-public-rspm: true

      - name: Install Python packages for testing
        run: |
          dir.create("~/.pip", showWarnings = FALSE, recursive = TRUE)
          writeLines(
            c("[global]", "break-system-packages = true", "user = true"),
            "~/.pip/pip.conf"
          )
          if (Sys.which("pip3") != "") {
            system("pip3 install pyarrow pandas")
          } else {
            system("pip install pyarrow pandas")
          }
        shell: Rscript {0}

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::covr
10 changes: 8 additions & 2 deletions DESCRIPTION
@@ -13,9 +13,15 @@ Description: Self-sufficient reader for a subset of Parquet files.
    Nested tables, compression besides Snappy and encryption are not
    supported.
Depends:
    R (>= 3.5.0)
    R (>= 4.0.0)
License: MIT + file LICENSE
Encoding: UTF-8
Suggests: testthat
Suggests:
    arrow,
    DBI,
    duckdb,
    processx,
    testthat
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Config/testthat/edition: 3
1 change: 1 addition & 0 deletions R/miniparquet.R
@@ -38,6 +38,7 @@ read_parquet <- parquet_read
#' # add row names as a column, because `parquet_write()` ignores them.
#' mtcars2 <- cbind(name = rownames(mtcars), mtcars)
#' parquet_write(mtcars2, "mtcars.parquet")
#' \dontshow{if (Sys.getenv("NOT_CRAN") == "true") unlink("mtcars.parquet")}

parquet_write <- function(
  x,
1 change: 1 addition & 0 deletions man/parquet_write.Rd

Some generated files are not rendered by default.

53 changes: 53 additions & 0 deletions tests/testthat/_snaps/write-parquet.md
@@ -0,0 +1,53 @@
# round trip with pandas/pyarrow

Code
writeLines(pyout$stdout)
Output
nam mpg cyl disp hp drat wt qsec vs am gear carb
0 Mazda RX4 21.0 6 160.0 110.0 3.90 2.620 16.46 0.0 1.0 4.0 4.0
1 Mazda RX4 Wag 21.0 6 160.0 110.0 3.90 2.875 17.02 0.0 1.0 4.0 4.0
2 Datsun 710 22.8 4 108.0 93.0 3.85 2.320 18.61 1.0 1.0 4.0 1.0
3 Hornet 4 Drive 21.4 6 258.0 110.0 3.08 3.215 19.44 1.0 0.0 3.0 1.0
4 Hornet Sportabout 18.7 8 360.0 175.0 3.15 3.440 17.02 0.0 0.0 3.0 2.0
5 Valiant 18.1 6 225.0 105.0 2.76 3.460 20.22 1.0 0.0 3.0 1.0
6 Duster 360 14.3 8 360.0 245.0 3.21 3.570 15.84 0.0 0.0 3.0 4.0
7 Merc 240D 24.4 4 146.7 62.0 3.69 3.190 20.00 1.0 0.0 4.0 2.0
8 Merc 230 22.8 4 140.8 95.0 3.92 3.150 22.90 1.0 0.0 4.0 2.0
9 Merc 280 19.2 6 167.6 123.0 3.92 3.440 18.30 1.0 0.0 4.0 4.0
10 Merc 280C 17.8 6 167.6 123.0 3.92 3.440 18.90 1.0 0.0 4.0 4.0
11 Merc 450SE 16.4 8 275.8 180.0 3.07 4.070 17.40 0.0 0.0 3.0 3.0
12 Merc 450SL 17.3 8 275.8 180.0 3.07 3.730 17.60 0.0 0.0 3.0 3.0
13 Merc 450SLC 15.2 8 275.8 180.0 3.07 3.780 18.00 0.0 0.0 3.0 3.0
14 Cadillac Fleetwood 10.4 8 472.0 205.0 2.93 5.250 17.98 0.0 0.0 3.0 4.0
15 Lincoln Continental 10.4 8 460.0 215.0 3.00 5.424 17.82 0.0 0.0 3.0 4.0
16 Chrysler Imperial 14.7 8 440.0 230.0 3.23 5.345 17.42 0.0 0.0 3.0 4.0
17 Fiat 128 32.4 4 78.7 66.0 4.08 2.200 19.47 1.0 1.0 4.0 1.0
18 Honda Civic 30.4 4 75.7 52.0 4.93 1.615 18.52 1.0 1.0 4.0 2.0
19 Toyota Corolla 33.9 4 71.1 65.0 4.22 1.835 19.90 1.0 1.0 4.0 1.0
20 Toyota Corona 21.5 4 120.1 97.0 3.70 2.465 20.01 1.0 0.0 3.0 1.0
21 Dodge Challenger 15.5 8 318.0 150.0 2.76 3.520 16.87 0.0 0.0 3.0 2.0
22 AMC Javelin 15.2 8 304.0 150.0 3.15 3.435 17.30 0.0 0.0 3.0 2.0
23 Camaro Z28 13.3 8 350.0 245.0 3.73 3.840 15.41 0.0 0.0 3.0 4.0
24 Pontiac Firebird 19.2 8 400.0 175.0 3.08 3.845 17.05 0.0 0.0 3.0 2.0
25 Fiat X1-9 27.3 4 79.0 66.0 4.08 1.935 18.90 1.0 1.0 4.0 1.0
26 Porsche 914-2 26.0 4 120.3 91.0 4.43 2.140 16.70 0.0 1.0 5.0 2.0
27 Lotus Europa 30.4 4 95.1 113.0 3.77 1.513 16.90 1.0 1.0 5.0 2.0
28 Ford Pantera L 15.8 8 351.0 264.0 4.22 3.170 14.50 0.0 1.0 5.0 4.0
29 Ferrari Dino 19.7 6 145.0 175.0 3.62 2.770 15.50 0.0 1.0 5.0 6.0
30 Maserati Bora 15.0 8 301.0 335.0 3.54 3.570 14.60 0.0 1.0 5.0 8.0
31 Volvo 142E 21.4 4 121.0 109.0 4.11 2.780 18.60 1.0 1.0 4.0 2.0
nam object
mpg float64
cyl int32
disp float64
hp float64
drat float64
wt float64
qsec float64
vs float64
am float64
gear float64
carb float64
dtype: object

9 changes: 9 additions & 0 deletions tests/testthat/helper.R
@@ -0,0 +1,9 @@
test_df <- function(tibble = FALSE) {
  df <- cbind(nam = rownames(mtcars), mtcars)
  df$cyl <- as.integer(df$cyl)
  rownames(df) <- NULL
  if (tibble) {
    class(df) <- c("tbl_df", "tbl", "data.frame")
  }
  df
}
41 changes: 41 additions & 0 deletions tests/testthat/test-read-parquet.R
@@ -91,3 +91,44 @@ test_that("basic reading works with snappy", {
  res <- parquet_read(test_path("data/alltypes_plain.snappy.parquet"))
  expect_true(data_comparable(alltypes_plain_snappy, res))
})

test_that("round trip with arrow", {
# Don't want to skip on the parquet capability missing, because then
# this might not be tested on the CI. So rather we skip on CRAN.
skip_on_cran()
mt <- test_df()
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)

arrow::write_parquet(mt, tmp, compression = "uncompressed")
expect_equal(read_parquet(tmp), mt)
unlink(tmp)

arrow::write_parquet(mt, tmp, compression = "snappy")
expect_equal(read_parquet(tmp), mt)
})

test_that("round trip with duckdb", {
mt <- test_df()
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)

drv <- duckdb::duckdb()
con <- DBI::dbConnect(drv)
on.exit(DBI::dbDisconnect(con), add = TRUE)
DBI::dbWriteTable(con, "mtcars", mt)

DBI::dbExecute(con, DBI::sqlInterpolate(con,
"COPY mtcars TO ?filename (FORMAT 'parquet', COMPRESSION 'uncompressed')",
filename = tmp
))
expect_equal(read_parquet(tmp), mt)
unlink(tmp)

DBI::dbExecute(con, DBI::sqlInterpolate(con,
"COPY mtcars TO ?filename (FORMAT PARQUET, COMPRESSION 'snappy')",
filename = tmp
))
arrow::write_parquet(mt, tmp, compression = "snappy")
expect_equal(read_parquet(tmp), mt)
})
70 changes: 69 additions & 1 deletion tests/testthat/test-write-parquet.R
@@ -1,5 +1,5 @@
test_that("round trip", {
mt <- cbind(nam = rownames(mtcars), mtcars)
mt <- test_df()
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)

@@ -10,3 +10,71 @@ test_that("round trip", {
write_parquet(mt, tmp, compression = "snappy")
expect_true(all(read_parquet(tmp) == mt))
})

test_that("round trip with arrow", {
# Don't want to skip on the parquet capability missing, because then
# this might not be tested on the CI. So rather we skip on CRAN.
skip_on_cran()
mt <- test_df(tibble = TRUE)
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)

write_parquet(mt, tmp, compression = "uncompressed")
expect_equal(arrow::read_parquet(tmp), mt)
unlink(tmp)

write_parquet(mt, tmp, compression = "snappy")
expect_equal(arrow::read_parquet(tmp), mt)
})

test_that("round trip with duckdb", {
mt <- test_df()
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)

write_parquet(mt, tmp, compression = "uncompressed")
df <- duckdb:::sql(sprintf("FROM '%s'", tmp))
expect_equal(df, mt)
unlink(tmp)

write_parquet(mt, tmp, compression = "snappy")
df <- duckdb:::sql(sprintf("FROM '%s'", tmp))
expect_equal(df, mt)
})

test_that("round trip with pandas/pyarrow", {
skip_on_cran()
mt <- test_df()
tmp1 <- tempfile(fileext = ".parquet")
tmp2 <- tempfile(fileext = ".parquet")
on.exit(unlink(c(tmp1, tmp2)), add = TRUE)
# need to create to be able to call normalizePath()
file.create(tmp1)
file.create(tmp2)

py_read <- function(input, output) {
pyscript <- sprintf(r"[
import pyarrow
import pandas
pandas.set_option("display.width", 150)
pandas.set_option("display.max_columns", 150)
pandas.set_option("display.max_colwidth", 150)
df = pandas.read_parquet("%s", engine = "pyarrow")
print(df)
print(df.dtypes)
df.to_parquet("%s", engine = "pyarrow")
]", normalizePath(input, winslash = "/"), normalizePath(output, winslash = "/"))
pytmp <- tempfile(fileext = ".py")
on.exit(unlink(pytmp), add = TRUE)
writeLines(pyscript, pytmp)
py <- if (Sys.which("python3") != "") "python3" else "python"
processx::run(py, pytmp, stderr = "2>&1", error_on_status = FALSE)
}

write_parquet(mt, tmp1, compression = "uncompressed")
pyout <- py_read(tmp1, tmp2)
expect_snapshot(writeLines(pyout$stdout))

mt2 <- read_parquet(tmp2)
expect_equal(mt2, mt)
})
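
To exercise these round-trip tests outside of CI, something along these lines should work; this is a sketch, assuming arrow, DBI, duckdb, processx and a Python with pandas and pyarrow are already installed, since the tests are guarded by skip_on_cran():

# Run the new round-trip tests locally; skip_on_cran() only lets them
# run when the NOT_CRAN environment variable is set to "true".
Sys.setenv(NOT_CRAN = "true")
testthat::test_file("tests/testthat/test-write-parquet.R")
testthat::test_file("tests/testthat/test-read-parquet.R")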
