Row group related tests

r-lib · Sep 13, 2024 · aa4cfd6 · aa4cfd6
1 parent 6cbcd8c
commit aa4cfd6
Show file tree

Hide file tree

Showing 4 changed files with 190 additions and 0 deletions.
diff --git a/tests/testthat/_snaps/utils.md b/tests/testthat/_snaps/utils.md
@@ -0,0 +1,28 @@
+# as_count
+
+ Code
+  as_count(1:2)
+ Condition
+  Error in `as_count()`:
+  ! x must be a count, i.e. an integer scalar
+ Code
+  as_count(0)
+ Condition
+  Error in `as_count()`:
+  ! x must be a count, i.e. an integer scalar
+ Code
+  as_count(NA_real_)
+ Condition
+  Error in `as_count()`:
+  ! x must be a count, i.e. an integer scalar
+ Code
+  as_count(-100)
+ Condition
+  Error in `as_count()`:
+  ! x must be a count, i.e. an integer scalar
+ Code
+  as_count(-100L)
+ Condition
+  Error in `as_count()`:
+  ! x must be a count, i.e. an integer scalar
+
diff --git a/tests/testthat/_snaps/write-parquet-row-groups.md b/tests/testthat/_snaps/write-parquet-row-groups.md
@@ -0,0 +1,72 @@
+# errors
+
+ Code
+  parquet_options(num_rows_per_row_group = "foobar")
+ Condition
+  Error in `as_count()`:
+  ! num_rows_per_row_group must be a count, i.e. an integer scalar
+
+---
+
+ Code
+  write_parquet(df, tmp, row_groups = "foobar")
+ Condition
+  Error in `parse_row_groups()`:
+  ! Row groups must be specified as a growing positive integer vector, starting with 1.
+ Code
+  write_parquet(df, tmp, row_groups = c(100L, 1L))
+ Condition
+  Error in `parse_row_groups()`:
+  ! Row groups must be specified as a growing positive integer vector, starting with 1.
+ Code
+  write_parquet(df, tmp, row_groups = c(1L, 100L))
+ Condition
+  Error in `write_parquet()`:
+  ! Internal nanoparquet error, row index too large
+
+# grouped df
+
+ Code
+  write_parquet(df, tmp)
+ Message
+  Ordering data frame according to row groups.
+
+---
+
+ Code
+  as.data.frame(read_parquet(tmp)[, c("nam", "cyl")])
+ Output
+  nam cyl
+  1 Datsun 710 4
+  2 Merc 240D 4
+  3 Merc 230 4
+  4 Fiat 128 4
+  5 Honda Civic 4
+  6 Toyota Corolla 4
+  7 Toyota Corona 4
+  8 Fiat X1-9 4
+  9 Porsche 914-2 4
+  10 Lotus Europa 4
+  11 Volvo 142E 4
+  12 Mazda RX4 6
+  13 Mazda RX4 Wag 6
+  14 Hornet 4 Drive 6
+  15 Valiant 6
+  16 Merc 280 6
+  17 Merc 280C 6
+  18 Ferrari Dino 6
+  19 Hornet Sportabout 8
+  20 Duster 360 8
+  21 Merc 450SE 8
+  22 Merc 450SL 8
+  23 Merc 450SLC 8
+  24 Cadillac Fleetwood 8
+  25 Lincoln Continental 8
+  26 Chrysler Imperial 8
+  27 Dodge Challenger 8
+  28 AMC Javelin 8
+  29 Camaro Z28 8
+  30 Pontiac Firebird 8
+  31 Ford Pantera L 8
+  32 Maserati Bora 8
+
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
@@ -51,3 +51,39 @@ test_that("is_uint32", {
  expect_false(is_uint32(NA_real_))
  expect_false(is_uint32("foo"))
 })
+
+test_that("is_icount", { # is_icount(): TRUE for the positive integer scalars below, FALSE otherwise
+ expect_true(is_icount(1L)) # smallest value expected to pass
+ expect_true(is_icount(100L))
+ expect_true(is_icount(2147483647L)) # .Machine$integer.max
+ # Inputs expected to be rejected: NA, length > 1, double storage, zero, negative.
+ expect_false(is_icount(NA_integer_))
+ expect_false(is_icount(1:2))
+ expect_false(is_icount(1)) # a double, even though it is whole-valued
+ expect_false(is_icount(0L))
+ expect_false(is_icount(-100L))
+})
+
+test_that("is_dcount", { # is_dcount(): TRUE for the positive double scalars below, FALSE otherwise
+ expect_true(is_dcount(1)) # smallest value expected to pass
+ expect_true(is_dcount(100))
+ expect_true(is_dcount(2147483647)) # same magnitude as .Machine$integer.max, stored as double
+ # Inputs expected to be rejected: NA, integer storage (vector or scalar), zero, negative.
+ expect_false(is_dcount(NA_real_))
+ expect_false(is_dcount(1:2)) # integer vector of length 2
+ expect_false(is_dcount(1L)) # integer storage
+ expect_false(is_dcount(0))
+ expect_false(is_dcount(-100))
+})
+
+test_that("as_count", { # as_count(): valid doubles compare equal to the integer count; invalid input errors
+ expect_equal(as_count(1), 1L) # NOTE(review): expect_equal() may not distinguish integer from double; confirm whether the type should be asserted
+ expect_equal(as_count(100), 100L)
+ expect_snapshot(error = TRUE, { # error text for each call is recorded in _snaps/utils.md
+ as_count(1:2)
+ as_count(0)
+ as_count(NA_real_)
+ as_count(-100)
+ as_count(-100L)
+ })
+})
diff --git a/tests/testthat/test-write-parquet-row-groups.R b/tests/testthat/test-write-parquet-row-groups.R
@@ -0,0 +1,54 @@
+test_that("errors", { # invalid row-group settings must raise the errors recorded in the snapshot file
+ expect_snapshot(error = TRUE, { # a non-count value for the option is rejected
+ parquet_options(num_rows_per_row_group = "foobar")
+ })
+ # Invalid `row_groups` passed to write_parquet(): a string, a vector not starting at 1, and (per the snapshot) a row index that is too large.
+ df <- test_df()
+ tmp <- tempfile(fileext = ".parquet")
+ on.exit(unlink(tmp), add = TRUE) # remove the output file even if an expectation fails
+ expect_snapshot(error = TRUE, {
+ write_parquet(df, tmp, row_groups = "foobar")
+ write_parquet(df, tmp, row_groups = c(100L, 1L))
+ write_parquet(df, tmp, row_groups = c(1L, 100L))
+ })
+})
+
+test_that("row groups", { # the same data must read back identically however it is split into row groups
+ tmp1 <- tempfile(fileext = ".parquet") # reference file, written with row_groups = 1L
+ tmp2 <- tempfile(fileext = ".parquet") # rewritten below with different row-group layouts
+ on.exit(unlink(c(tmp1, tmp2)), add = TRUE)
+ # Case 1: row_groups = c(1L, 16L) must produce 2 row groups (values presumably are the first row of each group; confirm).
+ df <- test_df()
+ write_parquet(df, tmp1, row_groups = 1L)
+ write_parquet(df, tmp2, row_groups = c(1L, 16L))
+ expect_equal(read_parquet(tmp1), read_parquet(tmp2))
+ expect_equal(nrow(read_parquet_metadata(tmp2)[["row_groups"]]), 2L)
+ # Case 2: one entry per row must produce as many row groups as there are rows.
+ unlink(tmp2)
+ write_parquet(df, tmp2, row_groups = seq_len(nrow(df)))
+ expect_equal(read_parquet(tmp1), read_parquet(tmp2))
+ expect_equal(nrow(read_parquet_metadata(tmp2)[["row_groups"]]), nrow(df))
+ # Case 3: no explicit row_groups; with the option set to 10L the file must contain 4 row groups.
+ unlink(tmp2)
+ withr::local_options(nanoparquet.num_rows_per_row_group = 10L) # reverted automatically when the test ends
+ write_parquet(df, tmp2)
+ expect_equal(read_parquet(tmp1), read_parquet(tmp2))
+ expect_equal(nrow(read_parquet_metadata(tmp2)[["row_groups"]]), 4L)
+})
+
+test_that("grouped df", { # a "groups" attribute on the data frame must determine the row groups
+ df <- test_df()
+ attr(df, "groups") <- data.frame( # hand-built grouping table, one row per `cyl` value
+ cyl = c(4L, 6L, 8L),
+ .rows = I(list( # row indices of `df` that belong to each group
+ c(3L, 8L, 9L, 18L, 19L, 20L, 21L, 26L, 27L, 28L, 32L),
+ c(1L, 2L, 4L, 6L, 10L, 11L, 30L),
+ c(5L, 7L, 12L, 13L, 14L, 15L, 16L, 17L, 22L, 23L, 24L, 25L, 29L, 31L)
+ ))
+ )
+ # local_tempfile() deletes the file when the test ends; previously it was left behind in tempdir().
+ tmp <- withr::local_tempfile(fileext = ".parquet")
+ expect_snapshot(write_parquet(df, tmp)) # snapshot records the "Ordering data frame..." message
+ expect_equal(nrow(read_parquet_metadata(tmp)[["row_groups"]]), 3L) # one row group per `cyl` group
+ expect_snapshot(as.data.frame(read_parquet(tmp)[, c("nam", "cyl")])) # rows read back ordered by group
+})