From 91aeec88deac8168f75a7a9d740eceba61df45bd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 25 Oct 2023 14:37:21 -0500 Subject: [PATCH] Drop `pyorc` dependency and use `pandas`/`pyarrow` instead (#14323) This PR removes dependency on `pyorc` in `cudf` altogether by using drop-in replacements found in `pandas` & `pyarrow`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14323 --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-120_arch-x86_64.yaml | 1 - cpp/tests/io/orc_test.cpp | 14 +- dependencies.yaml | 1 - docs/cudf/source/conf.py | 1 + docs/dask_cudf/source/conf.py | 1 + python/cudf/cudf/_fuzz_testing/orc.py | 18 +- python/cudf/cudf/_fuzz_testing/utils.py | 160 +------ python/cudf/cudf/tests/test_orc.py | 402 ++++++++---------- python/cudf/pyproject.toml | 1 - 10 files changed, 205 insertions(+), 395 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b5782800946..8b6b32bc026 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -70,7 +70,6 @@ dependencies: - ptxcompiler - pyarrow==12.0.1.* - pydata-sphinx-theme -- pyorc - pytest - pytest-benchmark - pytest-cases diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 473b9d07d88..ae15a6e97ab 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -68,7 +68,6 @@ dependencies: - protobuf>=4.21,<5 - pyarrow==12.0.1.* - pydata-sphinx-theme -- pyorc - pytest - pytest-benchmark - pytest-cases diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 890ef914713..3457c5675ad 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1299,20 +1299,16 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // This test can now be implemented with libcudf; keeping the pandas version to keep the test // inputs diversified // Method to create file: - // >>> import pyorc - // >>> output = open("./temp.orc", "wb") - // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt())) - // >>> writer.write((1, 3)) - // >>> writer.write((2, 4)) - // >>> writer.write((None, 5)) - // >>> writer.close() + // >>> import pandas as pd + // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) + // >>> df.to_orc("temp.orc") // // Contents of file: // >>> import pyarrow.orc as po - // >>> po.ORCFile('new.orc').read() + // >>> po.ORCFile('temp.orc').read() // pyarrow.Table // a: int64 // b: int64 diff --git a/dependencies.yaml b/dependencies.yaml index c3223e4394d..a7716a15360 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -548,7 +548,6 @@ dependencies: - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 - - pyorc - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index acb2a5d17f3..28e305b71cb 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -106,6 +106,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index 6861a9b90f6..00568a57431 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -57,6 +57,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 65d2e09988f..ecddc72fa85 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import copy import io @@ -6,14 +6,13 @@ import random import numpy as np -import pyorc +import pyarrow as pa import cudf from cudf._fuzz_testing.io import IOFuzz from cudf._fuzz_testing.utils import ( ALL_POSSIBLE_VALUES, _generate_rand_meta, - pandas_to_orc, pyarrow_to_pandas, ) from cudf.testing import dataset_generator as dg @@ -82,12 +81,7 @@ def generate_input(self): logging.info(f"Shape of DataFrame generated: {table.shape}") self._df = df file_obj = io.BytesIO() - pandas_to_orc( - df, - file_io_obj=file_obj, - stripe_size=self._rand(len(df)), - arrow_table_schema=table.schema, - ) + pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) file_obj.seek(0) buf = file_obj.read() self._current_buffer = copy.copy(buf) @@ -109,8 +103,8 @@ def set_rand_params(self, params): ) elif param == "stripes": f = io.BytesIO(self._current_buffer) - reader = pyorc.Reader(f) - stripes = [i for i in range(reader.num_of_stripes)] + orcFile = pa.orc.ORCFile(f) + stripes = list(range(orcFile.nstripes)) params_dict[param] = np.random.choice( [ None, @@ -119,7 +113,7 @@ def set_rand_params(self, params): int, np.unique( np.random.choice( - stripes, reader.num_of_stripes + stripes, orcFile.nstripes ) ), ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 03418e00cde..0c88c1aeacd 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -1,13 +1,11 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. import random -from collections import OrderedDict import fastavro import numpy as np import pandas as pd import pyarrow as pa -import pyorc import cudf from cudf.testing._utils import assert_eq @@ -41,40 +39,6 @@ cudf.dtype(" can result in incorrect dtype by pandas. - df = df.astype(dtypes) + orc_file = pa.orc.ORCFile(f) + records = [orc_file.read_stripe(i) for i in stripes] + pa_table = pa.Table.from_batches(records) + df = pa_table.to_pandas() return df diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 07aa5430f4f..7407da9c4ac 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -10,8 +10,6 @@ import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.orc -import pyorc import pytest import cudf @@ -150,9 +148,11 @@ def test_orc_reader_trailing_nulls(datadir): ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"], ) def test_orc_reader_datetimestamp(datadir, inputfile, use_index): + from pyarrow import orc + path = datadir / inputfile try: - orcfile = pa.orc.ORCFile(path) + orcfile = orc.ORCFile(path) except pa.ArrowIOError as e: pytest.skip(".orc file is not found: %s" % e) @@ -295,28 +295,29 @@ def test_orc_read_rows(datadir, skiprows, num_rows): def test_orc_read_skiprows(): buff = BytesIO() - data = [ - True, - False, - True, - False, - None, - True, - True, - True, - False, - None, - False, - False, - True, - True, - True, - True, - ] - writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) - writer.writerows([(d,) for d in data]) - writer.close() - + df = pd.DataFrame( + { + "a": [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] + } + ) + df.to_orc(buff) # testing 10 skiprows due to a boolean specific bug fix that didn't # repro for other sizes of data skiprows = 10 @@ -605,6 +606,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows == 6000000: @@ -623,7 +626,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -677,6 +680,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [2, 100, 6000000]) def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed @@ -729,7 +734,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(gdf_fname) + orc_file = orc.ORCFile(gdf_fname) ( file_stats, stripes_stats, @@ -782,6 +787,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): + from pyarrow import orc + # Make a dataframe gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) fname = tmpdir.join("gdf.orc") @@ -790,7 +797,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): gdf.to_orc(fname.strpath) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -978,44 +985,12 @@ def test_orc_string_stream_offset_issue(): assert_eq(df, cudf.read_orc(buffer)) -# Data is generated using pyorc module def generate_list_struct_buff(size=100_000): rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), - "lvl1_list": pyorc.Array(pyorc.BigInt()), - "lvl1_struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "lvl2_struct": pyorc.Struct( - **{ - "a": pyorc.BigInt(), - "lvl1_struct": pyorc.Struct( - **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} - ), - } - ), - "list_nests_struct": pyorc.Array( - pyorc.Array( - pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) - ) - ), - "struct_nests_list": pyorc.Struct( - **{ - "struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "list": pyorc.Array(pyorc.BigInt()), - } - ), - } - - schema = pyorc.Struct(**schema) - lvl3_list = [ rd.choice( [ @@ -1024,50 +999,57 @@ def generate_list_struct_buff(size=100_000): [ [ rd.choice([None, np.random.randint(1, 3)]) - for z in range(np.random.randint(1, 3)) + for _ in range(np.random.randint(1, 3)) ] - for z in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ] - for y in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ], ] ) - for x in range(size) + for _ in range(size) ] lvl1_list = [ [ rd.choice([None, np.random.randint(0, 3)]) - for y in range(np.random.randint(1, 4)) + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) ] lvl1_struct = [ - rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))]) - for x in range(size) + rd.choice( + [ + None, + {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + ] + ) + for _ in range(size) ] lvl2_struct = [ rd.choice( [ None, - ( - rd.choice([None, np.random.randint(0, 3)]), - ( - rd.choice([None, np.random.randint(0, 3)]), - np.random.randint(0, 3), - ), - ), + {"a": rd.choice([None, np.random.randint(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, np.random.randint(0, 3)]), + "d": np.random.randint(0, 3), + }, + }, ] ) - for x in range(size) + for _ in range(size) ] list_nests_struct = [ [ - [rd.choice(lvl1_struct), rd.choice(lvl1_struct)] - for y in range(np.random.randint(1, 4)) + {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) + ] + struct_nests_list = [ + {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) ] - struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)] df = pd.DataFrame( { @@ -1080,15 +1062,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = pyorc.Writer(buff, schema, stripe_size=1024) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(df.itertuples(index=False, name=None)), - ) - ) - writer.writerows(tuples) - writer.close() + df.to_orc(buff, engine="pyarrow", engine_kwargs={"stripe_size": 1024}) return buff @@ -1109,6 +1083,8 @@ def list_struct_buff(): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): + from pyarrow import orc + gdf = cudf.read_orc( list_struct_buff, columns=columns, @@ -1116,7 +1092,7 @@ def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): use_index=use_index, ) - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() pyarrow_tbl = ( pyarrow_tbl[:num_rows] @@ -1155,111 +1131,96 @@ def test_pyspark_struct(datadir): def gen_map_buff(size=10000): from string import ascii_letters as al + from pyarrow import orc + rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), - "lvl2_map": pyorc.Map( - key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) - ), - "lvl2_struct_map": pyorc.Map( - key=pyorc.String(), - value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), - ), - } - - schema = pyorc.Struct(**schema) - - lvl1_map = [ - rd.choice( - [ - None, - [ - ( - rd.choice(al), - rd.choice([None, np.random.randint(1, 1500)]), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_map = [ - rd.choice( - [ - None, + lvl1_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) - for z in range(5) - ], - ] + None, + { + rd.choice(al): rd.choice( + [None, np.random.randint(1, 1500)] ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_struct_map = [ - rd.choice( - [ - None, + }, + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.int64()), + ) + lvl2_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - ( - rd.choice( - [None, np.random.randint(1, 1500)] - ), - rd.choice( - [None, np.random.randint(1, 1500)] - ), - ), - ] - ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - - pdf = pd.DataFrame( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + [ + rd.choice( + [None, np.random.randint(1, 1500)] + ) + for _ in range(5) + ], + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.list_(pa.int64())), ) - writer = pyorc.Writer( - buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE + lvl2_struct_map = pa.array( + [ + rd.choice( + [ + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + { + "a": rd.choice( + [None, np.random.randint(1, 1500)] + ), + "b": rd.choice( + [None, np.random.randint(1, 1500)] + ), + }, + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), ) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(pdf.itertuples(index=False, name=None)), - ) + + pa_table = pa.Table.from_arrays( + [lvl1_map, lvl2_map, lvl2_struct_map], + ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - writer.writerows(tuples) - writer.close() + orc.write_table( + pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" + ) return buff @@ -1274,7 +1235,9 @@ def gen_map_buff(size=10000): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) @pytest.mark.parametrize("use_index", [True, False]) def test_map_type_read(columns, num_rows, use_index): - tbl = pa.orc.ORCFile(map_buff).read() + from pyarrow import orc + + tbl = orc.read_table(map_buff) lvl1_map = ( tbl["lvl1_map"] @@ -1460,18 +1423,22 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): ], ) def test_no_row_group_index_orc_read(datadir, fname): + from pyarrow import orc + fpath = datadir / fname - expect = pa.orc.ORCFile(fpath).read() + expect = orc.ORCFile(fpath).read() got = cudf.read_orc(fpath) assert expect.equals(got.to_arrow()) def test_names_in_struct_dtype_nesting(datadir): + from pyarrow import orc + fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - expect = pa.orc.ORCFile(fname).read() + expect = orc.ORCFile(fname).read() got = cudf.read_orc(fname) # test dataframes @@ -1483,12 +1450,14 @@ def test_names_in_struct_dtype_nesting(datadir): def test_writer_lists_structs(list_struct_buff): + from pyarrow import orc + df_in = cudf.read_orc(list_struct_buff) buff = BytesIO() df_in.to_orc(buff) - pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + pyarrow_tbl = orc.ORCFile(buff).read() assert pyarrow_tbl.equals(df_in.to_arrow()) @@ -1527,12 +1496,10 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with pyorc.Writer( - buff, - pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()), - ) as writer: - writer.write((maxint64, minint64, minint64)) - writer.write((1, -1, 1)) + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) + df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) assert file_stats[0]["a"].get("sum") is None @@ -1545,22 +1512,24 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): + from pyarrow import orc + buff = BytesIO() - orc_schema = pyorc.Struct( - a=pyorc.BigInt(), - b=pyorc.Double(), - c=pyorc.String(), - d=pyorc.Decimal(11, 2), - e=pyorc.Date(), - f=pyorc.Timestamp(), - g=pyorc.Boolean(), - h=pyorc.Binary(), - i=pyorc.BigInt(), - # One column with non null value, else cudf/pyorc readers crash + pa_table = pa.Table.from_arrays( + [ + pa.array([None], type=pa.int64()), + pa.array([None], type=pa.float64()), + pa.array([None], type=pa.string()), + pa.array([None], type=pa.decimal128(11, 2)), + pa.array([None], type=pa.timestamp("ns")), + pa.array([None], type=pa.date64()), + pa.array([None], type=pa.bool_()), + pa.array([None], type=pa.binary()), + pa.array([1], type=pa.int64()), + ], + ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with pyorc.Writer(buff, orc_schema) as writer: - writer.write(data) + orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1615,6 +1584,8 @@ def test_select_nested(list_struct_buff, equivalent_columns): def test_orc_writer_rle_stream_size(datadir, tmpdir): + from pyarrow import orc + original = datadir / "TestOrcFile.int16.rle.size.orc" reencoded = tmpdir.join("int16_map.orc") @@ -1622,7 +1593,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): df.to_orc(reencoded) # Segfaults when RLE stream sizes don't account for varint length - pa_out = pa.orc.ORCFile(reencoded).read() + pa_out = orc.ORCFile(reencoded).read() assert df.to_arrow().equals(pa_out) @@ -1642,11 +1613,13 @@ def test_empty_columns(): def test_orc_reader_zstd_compression(list_struct_buff): + from pyarrow import orc + expected = cudf.read_orc(list_struct_buff) # save with ZSTD compression buffer = BytesIO() - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() - writer = pyarrow.orc.ORCWriter(buffer, compression="zstd") + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + writer = orc.ORCWriter(buffer, compression="zstd") writer.write(pyarrow_tbl) writer.close() try: @@ -1845,10 +1818,7 @@ def negative_timestamp_df(): @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): buffer = BytesIO() - pyorc_table = pa.Table.from_pandas( - negative_timestamp_df.to_pandas(), preserve_index=False - ) - pyarrow.orc.write_table(pyorc_table, buffer) + negative_timestamp_df.to_orc(buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. @@ -1859,11 +1829,13 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): def test_orc_writer_negative_timestamp(negative_timestamp_df): + from pyarrow import orc + buffer = BytesIO() negative_timestamp_df.to_orc(buffer) assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 39a8dca0267..90759074750 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -58,7 +58,6 @@ test = [ "hypothesis", "mimesis>=4.1.0", "msgpack", - "pyorc", "pytest", "pytest-benchmark", "pytest-cases",