From 187197c369058f7d1377c1b161c469a9e4542caf Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 27 Jul 2024 11:53:05 -0500 Subject: [PATCH] GH-43349: [R] Fix altrep string columns from readr (#43351) ### Rationale for this change To resolve the reverse dependency issue with `parquetize` ### What changes are included in this PR? One step towards resolving the issue ### Are these changes tested? yes ### Are there any user-facing changes? no * GitHub Issue: #43349 Authored-by: Jonathan Keane Signed-off-by: Jonathan Keane --- r/src/arrow_cpp11.h | 11 ++++++++++- r/tests/testthat/test-csv.R | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index b2ed66b83c3d1..073b577d63ade 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -138,7 +138,13 @@ inline R_xlen_t r_string_size(SEXP s) { } // namespace unsafe inline SEXP utf8_strings(SEXP x) { - return cpp11::unwind_protect([x] { + return cpp11::unwind_protect([&] { + // ensure that x is not actually altrep first this also ensures that + // x is not altrep even after it is materialized + bool was_altrep = ALTREP(x); + if (was_altrep) { + x = PROTECT(Rf_duplicate(x)); + } R_xlen_t n = XLENGTH(x); // if `x` is an altrep of some sort, this will @@ -152,6 +158,9 @@ inline SEXP utf8_strings(SEXP x) { SET_STRING_ELT(x, i, Rf_mkCharCE(Rf_translateCharUTF8(s), CE_UTF8)); } } + if (was_altrep) { + UNPROTECT(1); + } return x; }); } diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R index 36f1f229a6085..a6291ebda09cc 100644 --- a/r/tests/testthat/test-csv.R +++ b/r/tests/testthat/test-csv.R @@ -738,5 +738,22 @@ test_that("read_csv2_arrow correctly parses comma decimals", { tf <- tempfile() writeLines("x;y\n1,2;c", con = tf) expect_equal(read_csv2_arrow(tf), tibble(x = 1.2, y = "c")) +}) + +test_that("altrep columns can roundtrip to table", { + tf <- tempfile() + on.exit(unlink(tf)) + write.csv(tbl, tf, row.names = FALSE) + + # read in, some columns will be altrep by default + new_df <- read_csv_arrow(tf) + expect_equal(tbl, as_tibble(arrow_table(new_df))) + + # but also if we materialize the vector + # this could also be accomplished with printing + new_df <- read_csv_arrow(tf) + test_arrow_altrep_force_materialize(new_df$chr) + # we should still be able to turn this into a table + expect_equal(tbl, as_tibble(arrow_table(new_df))) })