Skip to content

Commit

Permalink
Row group related tests
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi committed Sep 13, 2024
1 parent 6cbcd8c commit aa4cfd6
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 0 deletions.
28 changes: 28 additions & 0 deletions tests/testthat/_snaps/utils.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# as_count

Code
as_count(1:2)
Condition
Error in `as_count()`:
! x must be a count, i.e. an integer scalar
Code
as_count(0)
Condition
Error in `as_count()`:
! x must be a count, i.e. an integer scalar
Code
as_count(NA_real_)
Condition
Error in `as_count()`:
! x must be a count, i.e. an integer scalar
Code
as_count(-100)
Condition
Error in `as_count()`:
! x must be a count, i.e. an integer scalar
Code
as_count(-100L)
Condition
Error in `as_count()`:
! x must be a count, i.e. an integer scalar

72 changes: 72 additions & 0 deletions tests/testthat/_snaps/write-parquet-row-groups.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# errors

Code
parquet_options(num_rows_per_row_group = "foobar")
Condition
Error in `as_count()`:
! num_rows_per_row_group must be a count, i.e. an integer scalar

---

Code
write_parquet(df, tmp, row_groups = "foobar")
Condition
Error in `parse_row_groups()`:
! Row groups must be specified as a growing positive integer vector, starting with 1.
Code
write_parquet(df, tmp, row_groups = c(100L, 1L))
Condition
Error in `parse_row_groups()`:
! Row groups must be specified as a growing positive integer vector, starting with 1.
Code
write_parquet(df, tmp, row_groups = c(1L, 100L))
Condition
Error in `write_parquet()`:
! Internal nanoparquet error, row index too large

# grouped df

Code
write_parquet(df, tmp)
Message
Ordering data frame according to row groups.

---

Code
as.data.frame(read_parquet(tmp)[, c("nam", "cyl")])
Output
nam cyl
1 Datsun 710 4
2 Merc 240D 4
3 Merc 230 4
4 Fiat 128 4
5 Honda Civic 4
6 Toyota Corolla 4
7 Toyota Corona 4
8 Fiat X1-9 4
9 Porsche 914-2 4
10 Lotus Europa 4
11 Volvo 142E 4
12 Mazda RX4 6
13 Mazda RX4 Wag 6
14 Hornet 4 Drive 6
15 Valiant 6
16 Merc 280 6
17 Merc 280C 6
18 Ferrari Dino 6
19 Hornet Sportabout 8
20 Duster 360 8
21 Merc 450SE 8
22 Merc 450SL 8
23 Merc 450SLC 8
24 Cadillac Fleetwood 8
25 Lincoln Continental 8
26 Chrysler Imperial 8
27 Dodge Challenger 8
28 AMC Javelin 8
29 Camaro Z28 8
30 Pontiac Firebird 8
31 Ford Pantera L 8
32 Maserati Bora 8

36 changes: 36 additions & 0 deletions tests/testthat/test-utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,39 @@ test_that("is_uint32", {
expect_false(is_uint32(NA_real_))
expect_false(is_uint32("foo"))
})

# is_icount() must be TRUE for positive integer scalars and FALSE for
# missing values, longer vectors, doubles, zero and negative integers.
test_that("is_icount", {
  accepted <- list(1L, 100L, 2147483647L)
  for (value in accepted) {
    expect_true(is_icount(value), info = deparse(value))
  }

  # list() keeps each candidate's type and length intact while iterating.
  rejected <- list(NA_integer_, 1:2, 1, 0L, -100L)
  for (value in rejected) {
    expect_false(is_icount(value), info = deparse(value))
  }
})

# is_dcount() must be TRUE for positive whole-number double scalars and
# FALSE for missing values, longer vectors, integers, zero and negatives.
test_that("is_dcount", {
  accepted <- list(1, 100, 2147483647)
  for (value in accepted) {
    expect_true(is_dcount(value), info = deparse(value))
  }

  # list() keeps each candidate's type and length intact while iterating.
  rejected <- list(NA_real_, 1:2, 1L, 0, -100)
  for (value in rejected) {
    expect_false(is_dcount(value), info = deparse(value))
  }
})

# as_count() converts a valid count given as a double to an integer and
# raises an error for anything that is not a positive scalar count.
test_that("as_count", {
  # Valid double counts come back as the matching integer value.
  expect_equal(as_count(1), 1L)
  expect_equal(as_count(100), 100L)
  # The calls below are recorded verbatim in _snaps/utils.md; editing them
  # requires regenerating the snapshot. Each one is expected to fail with
  # "x must be a count, i.e. an integer scalar".
  expect_snapshot(error = TRUE, {
    as_count(1:2)
    as_count(0)
    as_count(NA_real_)
    as_count(-100)
    as_count(-100L)
  })
})
54 changes: 54 additions & 0 deletions tests/testthat/test-write-parquet-row-groups.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Error reporting for invalid row group specifications, both through
# parquet_options() and through the `row_groups` argument of write_parquet().
# The exact messages are pinned in _snaps/write-parquet-row-groups.md.
test_that("errors", {
  # A non-numeric rows-per-row-group option is rejected by as_count().
  expect_snapshot(error = TRUE, {
    parquet_options(num_rows_per_row_group = "foobar")
  })

  df <- test_df()
  tmp <- tempfile(fileext = ".parquet")
  on.exit(unlink(tmp), add = TRUE)
  # Per the snapshot, the first two calls fail in parse_row_groups() (not a
  # growing positive integer vector starting with 1), while the third passes
  # parsing and fails in write_parquet() with "row index too large".
  expect_snapshot(error = TRUE, {
    write_parquet(df, tmp, row_groups = "foobar")
    write_parquet(df, tmp, row_groups = c(100L, 1L))
    write_parquet(df, tmp, row_groups = c(1L, 100L))
  })
})

# Writing the same data with different row group layouts must give identical
# data on read, while the file metadata reports the requested group count.
test_that("row groups", {
  reference <- tempfile(fileext = ".parquet")
  candidate <- tempfile(fileext = ".parquet")
  on.exit(unlink(c(reference, candidate)), add = TRUE)

  # Number of row groups recorded in a Parquet file's metadata.
  count_row_groups <- function(path) {
    nrow(read_parquet_metadata(path)[["row_groups"]])
  }

  df <- test_df()

  # Reference file: a single row group. Candidate: split at row 16.
  write_parquet(df, reference, row_groups = 1L)
  write_parquet(df, candidate, row_groups = c(1L, 16L))
  expect_equal(read_parquet(reference), read_parquet(candidate))
  expect_equal(count_row_groups(candidate), 2L)

  # One row group per row.
  unlink(candidate)
  write_parquet(df, candidate, row_groups = seq_len(nrow(df)))
  expect_equal(read_parquet(reference), read_parquet(candidate))
  expect_equal(count_row_groups(candidate), nrow(df))

  # Row group size taken from the option instead of an explicit argument.
  # NOTE(review): the expected 4 groups assumes test_df() has 31-40 rows
  # (the snapshot output suggests 32) -- confirm.
  unlink(candidate)
  withr::local_options(nanoparquet.num_rows_per_row_group = 10L)
  write_parquet(df, candidate)
  expect_equal(read_parquet(reference), read_parquet(candidate))
  expect_equal(count_row_groups(candidate), 4L)
})

# A data frame carrying a "groups" attribute is written with one row group
# per group; per the snapshot, write_parquet() reorders the rows by group and
# emits the message "Ordering data frame according to row groups.".
test_that("grouped df", {
  df <- test_df()
  # Hand-built grouping metadata: one entry per `cyl` value, with the row
  # indices that belong to it.
  # NOTE(review): this looks like the layout dplyr uses for grouped data
  # frames -- confirm against write_parquet()'s handling.
  attr(df, "groups") <- data.frame(
    cyl = c(4L, 6L, 8L),
    .rows = I(list(
      c(3L, 8L, 9L, 18L, 19L, 20L, 21L, 26L, 27L, 28L, 32L),
      c(1L, 2L, 4L, 6L, 10L, 11L, 30L),
      c(5L, 7L, 12L, 13L, 14L, 15L, 16L, 17L, 22L, 23L, 24L, 25L, 29L, 31L)
    ))
  )

  tmp <- tempfile(fileext = ".parquet")
  # Remove the written file when the test finishes, as the other tests in
  # this file do; previously it was left behind in the session temp dir.
  on.exit(unlink(tmp), add = TRUE)

  # The expressions inside expect_snapshot() are recorded verbatim in
  # _snaps/write-parquet-row-groups.md and must not be reworded.
  expect_snapshot(write_parquet(df, tmp))
  expect_equal(nrow(read_parquet_metadata(tmp)[["row_groups"]]), 3L)
  expect_snapshot(as.data.frame(read_parquet(tmp)[, c("nam", "cyl")]))
})

0 comments on commit aa4cfd6

Please sign in to comment.