diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a738a6ff5f1..6bd2787d6dc 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -79,7 +79,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yml@branch-23.10
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yml@branch-23.10
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
       build_type: nightly
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 261f736291f..fff5bf1e840 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -16,7 +16,7 @@ source:
 
 build:
   number: {{ GIT_DESCRIBE_NUMBER }}
-  string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+  string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   script_env:
     - AWS_ACCESS_KEY_ID
     - AWS_SECRET_ACCESS_KEY
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 043a7322aea..6f74575f518 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -388,7 +388,7 @@ outputs:
       script: install_libcudf_example.sh
       build:
         number: {{ GIT_DESCRIBE_NUMBER }}
-        string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+        string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
         ignore_run_exports_from:
           {% if cuda_major == "11" %}
           - {{ compiler('cuda11') }}
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 4469ec59b7a..af53a09e4db 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -871,45 +871,65 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s,
                                             level_type lvl)
 {
   int32_t len;
-  int level_bits    = s->col.level_bits[lvl];
-  Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
-                                                    : s->page.repetition_level_encoding;
+  int const level_bits = s->col.level_bits[lvl];
+  auto const encoding  = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
+                                                       : s->page.repetition_level_encoding;
 
   auto start = cur;
-  if (level_bits == 0) {
-    len                       = 0;
-    s->initial_rle_run[lvl]   = s->page.num_input_values * 2;  // repeated value
-    s->initial_rle_value[lvl] = 0;
-    s->lvl_start[lvl]         = cur;
-    s->abs_lvl_start[lvl]     = cur;
-  } else if (encoding == Encoding::RLE) {
-    // V2 only uses RLE encoding, so only perform check here
-    if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) {
-      len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
-    } else if (cur + 4 < end) {
-      len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
-      cur += 4;
-    } else {
-      len      = 0;
-      s->error = 2;
-    }
-    s->abs_lvl_start[lvl] = cur;
-    if (!s->error) {
-      uint32_t run            = get_vlq32(cur, end);
-      s->initial_rle_run[lvl] = run;
-      if (!(run & 1)) {
-        int v = (cur < end) ? cur[0] : 0;
+
+  auto init_rle = [s, lvl, end, level_bits](uint8_t const* cur, uint8_t const* end) {
+    uint32_t const run      = get_vlq32(cur, end);
+    s->initial_rle_run[lvl] = run;
+    if (!(run & 1)) {
+      if (cur < end) {
+        int v = cur[0];
         cur++;
         if (level_bits > 8) {
           v |= ((cur < end) ? cur[0] : 0) << 8;
           cur++;
         }
         s->initial_rle_value[lvl] = v;
+      } else {
+        s->initial_rle_value[lvl] = 0;
       }
-      s->lvl_start[lvl] = cur;
     }
+    s->lvl_start[lvl] = cur;
     if (cur > end) { s->error = 2; }
+  };
+
+  // this is a little redundant. if level_bits == 0, then nothing should be encoded
+  // for the level, but some V2 files in the wild violate this and encode the data anyway.
+  // thus we will handle V2 headers separately.
+  if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0) {
+    // V2 only uses RLE encoding so no need to check encoding
+    len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
+    s->abs_lvl_start[lvl] = cur;
+    if (len == 0) {
+      s->initial_rle_run[lvl]   = s->page.num_input_values * 2;  // repeated value
+      s->initial_rle_value[lvl] = 0;
+      s->lvl_start[lvl]         = cur;
+    } else {
+      init_rle(cur, cur + len);
+    }
+  } else if (level_bits == 0) {
+    len                       = 0;
+    s->initial_rle_run[lvl]   = s->page.num_input_values * 2;  // repeated value
+    s->initial_rle_value[lvl] = 0;
+    s->lvl_start[lvl]         = cur;
+    s->abs_lvl_start[lvl]     = cur;
+  } else if (encoding == Encoding::RLE) {  // V1 header with RLE encoding
+    if (cur + 4 < end) {
+      len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
+      cur += 4;
+      s->abs_lvl_start[lvl] = cur;
+      init_rle(cur, cur + len);
+      // add back the 4 bytes for the length
+      len += 4;
+    } else {
+      len      = 0;
+      s->error = 2;
+    }
   } else if (encoding == Encoding::BIT_PACKED) {
     len                     = (s->page.num_input_values * level_bits + 7) >> 3;
     s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1;  // literal run
@@ -1247,7 +1267,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
           s->dict_val = 0;
           if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
           break;
-        case Encoding::RLE: s->dict_run = 0; break;
+        case Encoding::RLE: {
+          // first 4 bytes are length of RLE data
+          int const len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
+          cur += 4;
+          if (cur + len > end) { s->error = 2; }
+          s->dict_run = 0;
+        } break;
         default:
          s->error = 1;  // Unsupported encoding
          break;
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 16886d91fc9..e5dd029fde2 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -386,6 +386,7 @@ __global__ void __launch_bounds__(128)
         // definition levels
         bs->page.chunk_row += bs->page.num_rows;
         bs->page.num_rows = 0;
+        bs->page.flags    = 0;  // zero out V2 info
         bs->page.num_nulls     = 0;
         bs->page.def_lvl_bytes = 0;
@@ -395,7 +396,6 @@ __global__ void __launch_bounds__(128)
           case PageType::DATA_PAGE:
             index_out = num_dict_pages + data_page_count;
             data_page_count++;
-            bs->page.flags = 0;
             // this computation is only valid for flat schemas. for nested schemas,
             // they will be recomputed in the preprocess step by examining repetition and
             // definition levels
@@ -405,7 +405,7 @@ __global__ void __launch_bounds__(128)
           case PageType::DATA_PAGE_V2:
             index_out = num_dict_pages + data_page_count;
             data_page_count++;
-            bs->page.flags = 0;
+            bs->page.flags |= PAGEINFO_FLAGS_V2;
             values_found += bs->page.num_input_values;
             // V2 only uses RLE, so it was removed from the header
             bs->page.definition_level_encoding = Encoding::RLE;
@@ -414,7 +414,7 @@ __global__ void __launch_bounds__(128)
           case PageType::DICTIONARY_PAGE:
             index_out = dictionary_page_count;
             dictionary_page_count++;
-            bs->page.flags = PAGEINFO_FLAGS_DICTIONARY;
+            bs->page.flags |= PAGEINFO_FLAGS_DICTIONARY;
             break;
           default: index_out = -1; break;
         }
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 25d2885b7da..97c71de9a9b 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -74,6 +74,7 @@ namespace gpu {
  */
 enum {
   PAGEINFO_FLAGS_DICTIONARY = (1 << 0),  // Indicates a dictionary page
+  PAGEINFO_FLAGS_V2         = (1 << 1),  // V2 page header
 };
 
 /**
diff --git a/python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet b/python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet
new file mode 100644
index 00000000000..6a6de0a9422
Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet differ
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index f403c522f58..ff4c2e2a14d 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2518,6 +2518,15 @@ def test_parquet_reader_binary_decimal(datadir):
     assert_eq(expect, got)
 
 
+def test_parquet_reader_rle_boolean(datadir):
+    fname = datadir / "rle_boolean_encoding.parquet"
+
+    expect = pd.read_parquet(fname)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got)
+
+
 # testing a specific bug-fix/edge case.
 # specifically: int a parquet file containing a particular way of representing
 # a list column in a schema, the cudf reader was confusing