From 91aeec88deac8168f75a7a9d740eceba61df45bd Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 25 Oct 2023 14:37:21 -0500
Subject: [PATCH] Drop `pyorc` dependency and use `pandas`/`pyarrow` instead
 (#14323)

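This PR removes the dependency on `pyorc` from `cudf` altogether by using the replacements found in `pandas` and `pyarrow`. It also sets `navigation_with_keys` in the Sphinx theme options of the `cudf` and `dask_cudf` docs.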

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Jake Awe (https://github.com/AyodeAwe)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14323
---
 .../all_cuda-118_arch-x86_64.yaml             |   1 -
 .../all_cuda-120_arch-x86_64.yaml             |   1 -
 cpp/tests/io/orc_test.cpp                     |  14 +-
 dependencies.yaml                             |   1 -
 docs/cudf/source/conf.py                      |   1 +
 docs/dask_cudf/source/conf.py                 |   1 +
 python/cudf/cudf/_fuzz_testing/orc.py         |  18 +-
 python/cudf/cudf/_fuzz_testing/utils.py       | 160 +------
 python/cudf/cudf/tests/test_orc.py            | 402 ++++++++----------
 python/cudf/pyproject.toml                    |   1 -
 10 files changed, 205 insertions(+), 395 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b5782800946..8b6b32bc026 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -70,7 +70,6 @@ dependencies:
 - ptxcompiler
 - pyarrow==12.0.1.*
 - pydata-sphinx-theme
-- pyorc
 - pytest
 - pytest-benchmark
 - pytest-cases
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 473b9d07d88..ae15a6e97ab 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -68,7 +68,6 @@ dependencies:
 - protobuf>=4.21,<5
 - pyarrow==12.0.1.*
 - pydata-sphinx-theme
-- pyorc
 - pytest
 - pytest-benchmark
 - pytest-cases
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 890ef914713..3457c5675ad 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1299,20 +1299,16 @@ TEST_F(OrcStatisticsTest, Overflow)
 
 TEST_F(OrcStatisticsTest, HasNull)
 {
-  // This test can now be implemented with libcudf; keeping the pyorc version to keep the test
+  // This test can now be implemented with libcudf; keeping the pandas version to keep the test
   // inputs diversified
   // Method to create file:
-  // >>> import pyorc
-  // >>> output = open("./temp.orc", "wb")
-  // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt()))
-  // >>> writer.write((1, 3))
-  // >>> writer.write((2, 4))
-  // >>> writer.write((None, 5))
-  // >>> writer.close()
+  // >>> import pandas as pd
+  // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]})
+  // >>> df.to_orc("temp.orc")
   //
   // Contents of file:
   // >>> import pyarrow.orc as po
-  // >>> po.ORCFile('new.orc').read()
+  // >>> po.ORCFile('temp.orc').read()
   // pyarrow.Table
   // a: int64
   // b: int64
diff --git a/dependencies.yaml b/dependencies.yaml
index c3223e4394d..a7716a15360 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -548,7 +548,6 @@ dependencies:
           - fastavro>=0.22.9
           - hypothesis
           - mimesis>=4.1.0
-          - pyorc
           - pytest-benchmark
           - pytest-cases
           - python-snappy>=0.6.0
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index acb2a5d17f3..28e305b71cb 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -106,6 +106,7 @@
     "twitter_url": "https://twitter.com/rapidsai",
     "show_toc_level": 1,
     "navbar_align": "right",
+    "navigation_with_keys": True,
 }
 include_pandas_compat = True
 
diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py
index 6861a9b90f6..00568a57431 100644
--- a/docs/dask_cudf/source/conf.py
+++ b/docs/dask_cudf/source/conf.py
@@ -57,6 +57,7 @@
     "twitter_url": "https://twitter.com/rapidsai",
     "show_toc_level": 1,
     "navbar_align": "right",
+    "navigation_with_keys": True,
 }
 include_pandas_compat = True
 
diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index 65d2e09988f..ecddc72fa85 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 import copy
 import io
@@ -6,14 +6,13 @@
 import random
 
 import numpy as np
-import pyorc
+from pyarrow import orc
 
 import cudf
 from cudf._fuzz_testing.io import IOFuzz
 from cudf._fuzz_testing.utils import (
     ALL_POSSIBLE_VALUES,
     _generate_rand_meta,
-    pandas_to_orc,
     pyarrow_to_pandas,
 )
 from cudf.testing import dataset_generator as dg
@@ -82,12 +81,7 @@ def generate_input(self):
         logging.info(f"Shape of DataFrame generated: {table.shape}")
         self._df = df
         file_obj = io.BytesIO()
-        pandas_to_orc(
-            df,
-            file_io_obj=file_obj,
-            stripe_size=self._rand(len(df)),
-            arrow_table_schema=table.schema,
-        )
+        orc.write_table(table, file_obj, stripe_size=self._rand(len(df)))
         file_obj.seek(0)
         buf = file_obj.read()
         self._current_buffer = copy.copy(buf)
@@ -109,8 +103,8 @@ def set_rand_params(self, params):
                     )
                 elif param == "stripes":
                     f = io.BytesIO(self._current_buffer)
-                    reader = pyorc.Reader(f)
-                    stripes = [i for i in range(reader.num_of_stripes)]
+                    orc_file = orc.ORCFile(f)
+                    stripes = list(range(orc_file.nstripes))
                     params_dict[param] = np.random.choice(
                         [
                             None,
@@ -119,7 +113,7 @@ def set_rand_params(self, params):
                                     int,
                                     np.unique(
                                         np.random.choice(
-                                            stripes, reader.num_of_stripes
+                                            stripes, orc_file.nstripes
                                         )
                                     ),
                                 )
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 03418e00cde..0c88c1aeacd 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -1,13 +1,11 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 import random
-from collections import OrderedDict
 
 import fastavro
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pyorc
 
 import cudf
 from cudf.testing._utils import assert_eq
@@ -41,40 +39,6 @@
     cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
 }
 
-PANDAS_TO_ORC_TYPES = {
-    cudf.dtype("int8"): pyorc.TinyInt(),
-    pd.Int8Dtype(): pyorc.TinyInt(),
-    pd.Int16Dtype(): pyorc.SmallInt(),
-    pd.Int32Dtype(): pyorc.Int(),
-    pd.Int64Dtype(): pyorc.BigInt(),
-    pd.Float32Dtype(): pyorc.Float(),
-    pd.Float64Dtype(): pyorc.Double(),
-    pd.BooleanDtype(): pyorc.Boolean(),
-    cudf.dtype("bool_"): pyorc.Boolean(),
-    cudf.dtype("int16"): pyorc.SmallInt(),
-    cudf.dtype("int32"): pyorc.Int(),
-    cudf.dtype("int64"): pyorc.BigInt(),
-    cudf.dtype("O"): pyorc.String(),
-    pd.StringDtype(): pyorc.String(),
-    cudf.dtype("float32"): pyorc.Float(),
-    cudf.dtype("float64"): pyorc.Double(),
-    cudf.dtype("<M8[ns]"): pyorc.Timestamp(),
-    cudf.dtype("<M8[ms]"): pyorc.Timestamp(),
-    cudf.dtype("<M8[us]"): pyorc.Timestamp(),
-}
-
-ORC_TO_PANDAS_TYPES = {
-    pyorc.TinyInt().name: pd.Int8Dtype(),
-    pyorc.Int().name: pd.Int32Dtype(),
-    pyorc.Boolean().name: pd.BooleanDtype(),
-    pyorc.SmallInt().name: pd.Int16Dtype(),
-    pyorc.BigInt().name: pd.Int64Dtype(),
-    pyorc.String().name: pd.StringDtype(),
-    pyorc.Float().name: pd.Float32Dtype(),
-    pyorc.Double().name: pd.Float64Dtype(),
-    pyorc.Timestamp().name: cudf.dtype("<M8[ns]"),
-}
-
 
 def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
     obj._current_params = {}
@@ -213,24 +177,6 @@ def get_avro_dtype_info(dtype):
         )
 
 
-def get_orc_dtype_info(dtype):
-    if dtype in PANDAS_TO_ORC_TYPES:
-        return PANDAS_TO_ORC_TYPES[dtype]
-    else:
-        raise TypeError(
-            f"Unsupported dtype({dtype}) according to orc spec:"
-            f" https://orc.apache.org/specification/"
-        )
-
-
-def get_arrow_dtype_info_for_pyorc(dtype):
-    if isinstance(dtype, pa.StructType):
-        return get_orc_schema(df=None, arrow_table_schema=dtype)
-    else:
-        pd_dtype = cudf.dtype(dtype.to_pandas_dtype())
-        return get_orc_dtype_info(pd_dtype)
-
-
 def get_avro_schema(df):
     fields = [
         {"name": col_name, "type": get_avro_dtype_info(col_dtype)}
@@ -240,22 +186,6 @@ def get_avro_schema(df):
     return schema
 
 
-def get_orc_schema(df, arrow_table_schema=None):
-    if arrow_table_schema is None:
-        ordered_dict = OrderedDict(
-            (col_name, get_orc_dtype_info(col_dtype))
-            for col_name, col_dtype in df.dtypes.items()
-        )
-    else:
-        ordered_dict = OrderedDict(
-            (field.name, get_arrow_dtype_info_for_pyorc(field.type))
-            for field in arrow_table_schema
-        )
-
-    schema = pyorc.Struct(**ordered_dict)
-    return schema
-
-
 def convert_nulls_to_none(records, df):
     columns_with_nulls = {col for col in df.columns if df[col].isnull().any()}
     scalar_columns_convert = [
@@ -296,99 +226,31 @@ def pandas_to_avro(df, file_name=None, file_io_obj=None):
         fastavro.writer(file_io_obj, avro_schema, records)
 
 
-def _preprocess_to_orc_tuple(df, arrow_table_schema):
-    def _null_to_None(value):
-        if value is pd.NA or value is pd.NaT:
-            return None
-        else:
-            return value
-
-    def sanitize(value, struct_type):
-        if value is None:
-            return None
-
-        values_list = []
-        for name, sub_type in struct_type.fields.items():
-            if isinstance(sub_type, cudf.StructDtype):
-                values_list.append(sanitize(value[name], sub_type))
-            else:
-                values_list.append(value[name])
-        return tuple(values_list)
-
-    has_nulls_or_nullable_dtype = any(
-        (col := df[colname]).dtype in pandas_dtypes_to_np_dtypes
-        or col.isnull().any()
-        for colname in df.columns
-    )
-    pdf = df.copy(deep=True)
-    for field in arrow_table_schema:
-        if isinstance(field.type, pa.StructType):
-            pdf[field.name] = pdf[field.name].apply(
-                sanitize, args=(cudf.StructDtype.from_arrow(field.type),)
-            )
-        else:
-            pdf[field.name] = pdf[field.name]
-
-    tuple_list = [
-        tuple(map(_null_to_None, tup)) if has_nulls_or_nullable_dtype else tup
-        for tup in pdf.itertuples(index=False, name=None)
-    ]
-
-    return tuple_list, pdf, df
-
-
-def pandas_to_orc(
-    df,
-    file_name=None,
-    file_io_obj=None,
-    stripe_size=67108864,
-    arrow_table_schema=None,
-):
-    schema = get_orc_schema(df, arrow_table_schema=arrow_table_schema)
-
-    tuple_list, pdf, df = _preprocess_to_orc_tuple(
-        df, arrow_table_schema=arrow_table_schema
-    )
-
-    if file_name is not None:
-        with open(file_name, "wb") as data:
-            with pyorc.Writer(data, schema, stripe_size=stripe_size) as writer:
-                writer.writerows(tuple_list)
-    elif file_io_obj is not None:
-        with pyorc.Writer(
-            file_io_obj, schema, stripe_size=stripe_size
-        ) as writer:
-            writer.writerows(tuple_list)
-
-
 def orc_to_pandas(file_name=None, file_io_obj=None, stripes=None):
+    """Read ORC data into a pandas DataFrame.
+
+    Reads from `file_name` if given, otherwise from `file_io_obj`.
+    If `stripes` is None the whole file is read; otherwise only the
+    stripes with the listed indices are read, in the given order.
+    """
     if file_name is not None:
         f = open(file_name, "rb")
     elif file_io_obj is not None:
         f = file_io_obj
 
-    reader = pyorc.Reader(f)
-
-    dtypes = {
-        col: ORC_TO_PANDAS_TYPES[pyorc_type.name]
-        for col, pyorc_type in reader.schema.fields.items()
-    }
-
     if stripes is None:
-        df = pd.DataFrame.from_records(
-            reader, columns=reader.schema.fields.keys()
-        )
+        df = pd.read_orc(f)
     else:
-        records = [
-            record for i in stripes for record in list(reader.read_stripe(i))
-        ]
-        df = pd.DataFrame.from_records(
-            records, columns=reader.schema.fields.keys()
-        )
-
-    # Need to type-cast to extracted `dtypes` from pyorc schema because
-    # a fully empty/ full <NA> can result in incorrect dtype by pandas.
-    df = df.astype(dtypes)
+        # `import pyarrow as pa` does not load the `orc` submodule, so
+        # import it explicitly instead of relying on `pa.orc`.
+        from pyarrow import orc
+
+        orc_file = orc.ORCFile(f)
+        records = [orc_file.read_stripe(i) for i in stripes]
+        # Passing the schema lets an empty `stripes` list produce an
+        # empty table; without it `from_batches` raises ValueError.
+        pa_table = pa.Table.from_batches(records, schema=orc_file.schema)
+        df = pa_table.to_pandas()
 
     return df
 
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 07aa5430f4f..7407da9c4ac 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -10,8 +10,6 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import pyarrow.orc
-import pyorc
 import pytest
 
 import cudf
@@ -150,9 +148,11 @@ def test_orc_reader_trailing_nulls(datadir):
     ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"],
 )
 def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
+    from pyarrow import orc
+
     path = datadir / inputfile
     try:
-        orcfile = pa.orc.ORCFile(path)
+        orcfile = orc.ORCFile(path)
     except pa.ArrowIOError as e:
         pytest.skip(".orc file is not found: %s" % e)
 
@@ -295,28 +295,29 @@ def test_orc_read_rows(datadir, skiprows, num_rows):
 
 def test_orc_read_skiprows():
     buff = BytesIO()
-    data = [
-        True,
-        False,
-        True,
-        False,
-        None,
-        True,
-        True,
-        True,
-        False,
-        None,
-        False,
-        False,
-        True,
-        True,
-        True,
-        True,
-    ]
-    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
-    writer.writerows([(d,) for d in data])
-    writer.close()
-
+    df = pd.DataFrame(
+        {
+            "a": [
+                True,
+                False,
+                True,
+                False,
+                None,
+                True,
+                True,
+                True,
+                False,
+                None,
+                False,
+                False,
+                True,
+                True,
+                True,
+                True,
+            ]
+        }
+    )
+    df.to_orc(buff)
     # testing 10 skiprows due to a boolean specific bug fix that didn't
     # repro for other sizes of data
     skiprows = 10
@@ -605,6 +606,8 @@ def normalized_equals(value1, value2):
 @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"])
 @pytest.mark.parametrize("nrows", [1, 100, 6000000])
 def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
+    from pyarrow import orc
+
     supported_stat_types = supported_numpy_dtypes + ["str"]
     # Can't write random bool columns until issue #6763 is fixed
     if nrows == 6000000:
@@ -623,7 +626,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
     gdf.to_orc(fname.strpath, statistics=stats_freq)
 
     # Read back written ORC's statistics
-    orc_file = pa.orc.ORCFile(fname)
+    orc_file = orc.ORCFile(fname)
     (
         file_stats,
         stripes_stats,
@@ -677,6 +680,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq):
 @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"])
 @pytest.mark.parametrize("nrows", [2, 100, 6000000])
 def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
+    from pyarrow import orc
+
     np.random.seed(0)
     supported_stat_types = supported_numpy_dtypes + ["str"]
     # Can't write random bool columns until issue #6763 is fixed
@@ -729,7 +734,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
     expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True))
 
     # Read back written ORC's statistics
-    orc_file = pa.orc.ORCFile(gdf_fname)
+    orc_file = orc.ORCFile(gdf_fname)
     (
         file_stats,
         stripes_stats,
@@ -782,6 +787,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq):
 
 @pytest.mark.parametrize("nrows", [1, 100, 6000000])
 def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
+    from pyarrow import orc
+
     # Make a dataframe
     gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)})
     fname = tmpdir.join("gdf.orc")
@@ -790,7 +797,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows):
     gdf.to_orc(fname.strpath)
 
     # Read back written ORC's statistics
-    orc_file = pa.orc.ORCFile(fname)
+    orc_file = orc.ORCFile(fname)
     (
         file_stats,
         stripes_stats,
@@ -978,44 +985,12 @@ def test_orc_string_stream_offset_issue():
     assert_eq(df, cudf.read_orc(buffer))
 
 
-# Data is generated using pyorc module
 def generate_list_struct_buff(size=100_000):
     rd = random.Random(1)
     np.random.seed(seed=1)
 
     buff = BytesIO()
 
-    schema = {
-        "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))),
-        "lvl1_list": pyorc.Array(pyorc.BigInt()),
-        "lvl1_struct": pyorc.Struct(
-            **{"a": pyorc.BigInt(), "b": pyorc.BigInt()}
-        ),
-        "lvl2_struct": pyorc.Struct(
-            **{
-                "a": pyorc.BigInt(),
-                "lvl1_struct": pyorc.Struct(
-                    **{"c": pyorc.BigInt(), "d": pyorc.BigInt()}
-                ),
-            }
-        ),
-        "list_nests_struct": pyorc.Array(
-            pyorc.Array(
-                pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()})
-            )
-        ),
-        "struct_nests_list": pyorc.Struct(
-            **{
-                "struct": pyorc.Struct(
-                    **{"a": pyorc.BigInt(), "b": pyorc.BigInt()}
-                ),
-                "list": pyorc.Array(pyorc.BigInt()),
-            }
-        ),
-    }
-
-    schema = pyorc.Struct(**schema)
-
     lvl3_list = [
         rd.choice(
             [
@@ -1024,50 +999,57 @@ def generate_list_struct_buff(size=100_000):
                     [
                         [
                             rd.choice([None, np.random.randint(1, 3)])
-                            for z in range(np.random.randint(1, 3))
+                            for _ in range(np.random.randint(1, 3))
                         ]
-                        for z in range(np.random.randint(0, 3))
+                        for _ in range(np.random.randint(0, 3))
                     ]
-                    for y in range(np.random.randint(0, 3))
+                    for _ in range(np.random.randint(0, 3))
                 ],
             ]
         )
-        for x in range(size)
+        for _ in range(size)
     ]
     lvl1_list = [
         [
             rd.choice([None, np.random.randint(0, 3)])
-            for y in range(np.random.randint(1, 4))
+            for _ in range(np.random.randint(1, 4))
         ]
-        for x in range(size)
+        for _ in range(size)
     ]
     lvl1_struct = [
-        rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))])
-        for x in range(size)
+        rd.choice(
+            [
+                None,
+                {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)},
+            ]
+        )
+        for _ in range(size)
     ]
     lvl2_struct = [
         rd.choice(
             [
                 None,
-                (
-                    rd.choice([None, np.random.randint(0, 3)]),
-                    (
-                        rd.choice([None, np.random.randint(0, 3)]),
-                        np.random.randint(0, 3),
-                    ),
-                ),
+                {"a": rd.choice([None, np.random.randint(0, 3)])},
+                {
+                    "lvl1_struct": {
+                        "c": rd.choice([None, np.random.randint(0, 3)]),
+                        "d": np.random.randint(0, 3),
+                    },
+                },
             ]
         )
-        for x in range(size)
+        for _ in range(size)
     ]
     list_nests_struct = [
         [
-            [rd.choice(lvl1_struct), rd.choice(lvl1_struct)]
-            for y in range(np.random.randint(1, 4))
+            {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)}
+            for _ in range(np.random.randint(1, 4))
         ]
-        for x in range(size)
+        for _ in range(size)
+    ]
+    struct_nests_list = [
+        {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size)
     ]
-    struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)]
 
     df = pd.DataFrame(
         {
@@ -1080,15 +1062,7 @@ def generate_list_struct_buff(size=100_000):
         }
     )
 
-    writer = pyorc.Writer(buff, schema, stripe_size=1024)
-    tuples = list(
-        map(
-            lambda x: (None,) if x[0] is pd.NA else x,
-            list(df.itertuples(index=False, name=None)),
-        )
-    )
-    writer.writerows(tuples)
-    writer.close()
+    df.to_orc(buff, engine="pyarrow", engine_kwargs={"stripe_size": 1024})
 
     return buff
 
@@ -1109,6 +1083,8 @@ def list_struct_buff():
 @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000])
 @pytest.mark.parametrize("use_index", [True, False])
 def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff):
+    from pyarrow import orc
+
     gdf = cudf.read_orc(
         list_struct_buff,
         columns=columns,
@@ -1116,7 +1092,7 @@ def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff):
         use_index=use_index,
     )
 
-    pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read()
+    pyarrow_tbl = orc.ORCFile(list_struct_buff).read()
 
     pyarrow_tbl = (
         pyarrow_tbl[:num_rows]
@@ -1155,111 +1131,96 @@ def test_pyspark_struct(datadir):
 def gen_map_buff(size=10000):
     from string import ascii_letters as al
 
+    from pyarrow import orc
+
     rd = random.Random(1)
     np.random.seed(seed=1)
 
     buff = BytesIO()
 
-    schema = {
-        "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()),
-        "lvl2_map": pyorc.Map(
-            key=pyorc.String(), value=pyorc.Array(pyorc.BigInt())
-        ),
-        "lvl2_struct_map": pyorc.Map(
-            key=pyorc.String(),
-            value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}),
-        ),
-    }
-
-    schema = pyorc.Struct(**schema)
-
-    lvl1_map = [
-        rd.choice(
-            [
-                None,
-                [
-                    (
-                        rd.choice(al),
-                        rd.choice([None, np.random.randint(1, 1500)]),
-                    )
-                    for y in range(2)
-                ],
-            ]
-        )
-        for x in range(size)
-    ]
-    lvl2_map = [
-        rd.choice(
-            [
-                None,
+    lvl1_map = pa.array(
+        [
+            rd.choice(
                 [
-                    (
-                        rd.choice(al),
-                        rd.choice(
-                            [
-                                None,
-                                [
-                                    rd.choice(
-                                        [None, np.random.randint(1, 1500)]
-                                    )
-                                    for z in range(5)
-                                ],
-                            ]
+                    None,
+                    {
+                        rd.choice(al): rd.choice(
+                            [None, np.random.randint(1, 1500)]
                         ),
-                    )
-                    for y in range(2)
-                ],
-            ]
-        )
-        for x in range(size)
-    ]
-    lvl2_struct_map = [
-        rd.choice(
-            [
-                None,
+                    },
+                ]
+            )
+            for _ in range(size)
+        ],
+        type=pa.map_(pa.string(), pa.int64()),
+    )
+    lvl2_map = pa.array(
+        [
+            rd.choice(
                 [
-                    (
-                        rd.choice(al),
-                        rd.choice(
-                            [
-                                None,
-                                (
-                                    rd.choice(
-                                        [None, np.random.randint(1, 1500)]
-                                    ),
-                                    rd.choice(
-                                        [None, np.random.randint(1, 1500)]
-                                    ),
-                                ),
-                            ]
-                        ),
-                    )
-                    for y in range(2)
-                ],
-            ]
-        )
-        for x in range(size)
-    ]
-
-    pdf = pd.DataFrame(
-        {
-            "lvl1_map": lvl1_map,
-            "lvl2_map": lvl2_map,
-            "lvl2_struct_map": lvl2_struct_map,
-        }
+                    None,
+                    *(
+                        {
+                            rd.choice(al): rd.choice(
+                                [
+                                    None,
+                                    [
+                                        rd.choice(
+                                            [None, np.random.randint(1, 1500)]
+                                        )
+                                        for _ in range(5)
+                                    ],
+                                ]
+                            )
+                        }
+                        for _ in range(2)
+                    ),
+                ]
+            )
+            for _ in range(size)
+        ],
+        type=pa.map_(pa.string(), pa.list_(pa.int64())),
     )
-    writer = pyorc.Writer(
-        buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE
+    lvl2_struct_map = pa.array(
+        [
+            rd.choice(
+                [
+                    None,
+                    *(
+                        {
+                            rd.choice(al): rd.choice(
+                                [
+                                    None,
+                                    {
+                                        "a": rd.choice(
+                                            [None, np.random.randint(1, 1500)]
+                                        ),
+                                        "b": rd.choice(
+                                            [None, np.random.randint(1, 1500)]
+                                        ),
+                                    },
+                                ]
+                            )
+                        }
+                        for _ in range(2)
+                    ),
+                ]
+            )
+            for _ in range(size)
+        ],
+        type=pa.map_(
+            pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()})
+        ),
     )
-    tuples = list(
-        map(
-            lambda x: (None,) if x[0] is pd.NA else x,
-            list(pdf.itertuples(index=False, name=None)),
-        )
+
+    pa_table = pa.Table.from_arrays(
+        [lvl1_map, lvl2_map, lvl2_struct_map],
+        ["lvl1_map", "lvl2_map", "lvl2_struct_map"],
     )
 
-    writer.writerows(tuples)
-    writer.close()
+    orc.write_table(
+        pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED"
+    )
 
     return buff
 
@@ -1274,7 +1235,9 @@ def gen_map_buff(size=10000):
 @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000])
 @pytest.mark.parametrize("use_index", [True, False])
 def test_map_type_read(columns, num_rows, use_index):
-    tbl = pa.orc.ORCFile(map_buff).read()
+    from pyarrow import orc
+
+    tbl = orc.read_table(map_buff)
 
     lvl1_map = (
         tbl["lvl1_map"]
@@ -1460,18 +1423,22 @@ def test_writer_timestamp_stream_size(datadir, tmpdir):
     ],
 )
 def test_no_row_group_index_orc_read(datadir, fname):
+    from pyarrow import orc
+
     fpath = datadir / fname
 
-    expect = pa.orc.ORCFile(fpath).read()
+    expect = orc.ORCFile(fpath).read()
     got = cudf.read_orc(fpath)
 
     assert expect.equals(got.to_arrow())
 
 
 def test_names_in_struct_dtype_nesting(datadir):
+    from pyarrow import orc
+
     fname = datadir / "TestOrcFile.NestedStructDataFrame.orc"
 
-    expect = pa.orc.ORCFile(fname).read()
+    expect = orc.ORCFile(fname).read()
     got = cudf.read_orc(fname)
 
     # test dataframes
@@ -1483,12 +1450,14 @@ def test_names_in_struct_dtype_nesting(datadir):
 
 
 def test_writer_lists_structs(list_struct_buff):
+    from pyarrow import orc
+
     df_in = cudf.read_orc(list_struct_buff)
 
     buff = BytesIO()
     df_in.to_orc(buff)
 
-    pyarrow_tbl = pyarrow.orc.ORCFile(buff).read()
+    pyarrow_tbl = orc.ORCFile(buff).read()
 
     assert pyarrow_tbl.equals(df_in.to_arrow())
 
@@ -1527,12 +1496,10 @@ def test_statistics_sum_overflow():
     minint64 = np.iinfo(np.int64).min
 
     buff = BytesIO()
-    with pyorc.Writer(
-        buff,
-        pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()),
-    ) as writer:
-        writer.write((maxint64, minint64, minint64))
-        writer.write((1, -1, 1))
+    df = pd.DataFrame(
+        {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]}
+    )
+    df.to_orc(buff)
 
     file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff])
     assert file_stats[0]["a"].get("sum") is None
@@ -1545,22 +1512,24 @@ def test_statistics_sum_overflow():
 
 
 def test_empty_statistics():
+    from pyarrow import orc
+
     buff = BytesIO()
-    orc_schema = pyorc.Struct(
-        a=pyorc.BigInt(),
-        b=pyorc.Double(),
-        c=pyorc.String(),
-        d=pyorc.Decimal(11, 2),
-        e=pyorc.Date(),
-        f=pyorc.Timestamp(),
-        g=pyorc.Boolean(),
-        h=pyorc.Binary(),
-        i=pyorc.BigInt(),
-        # One column with non null value, else cudf/pyorc readers crash
+    pa_table = pa.Table.from_arrays(
+        [
+            pa.array([None], type=pa.int64()),
+            pa.array([None], type=pa.float64()),
+            pa.array([None], type=pa.string()),
+            pa.array([None], type=pa.decimal128(11, 2)),
+            pa.array([None], type=pa.timestamp("ns")),
+            pa.array([None], type=pa.date64()),
+            pa.array([None], type=pa.bool_()),
+            pa.array([None], type=pa.binary()),
+            pa.array([1], type=pa.int64()),
+        ],
+        ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
     )
-    data = tuple([None] * (len(orc_schema.fields) - 1) + [1])
-    with pyorc.Writer(buff, orc_schema) as writer:
-        writer.write(data)
+    orc.write_table(pa_table, buff)
 
     got = cudf.io.orc.read_orc_statistics([buff])
 
@@ -1615,6 +1584,8 @@ def test_select_nested(list_struct_buff, equivalent_columns):
 
 
 def test_orc_writer_rle_stream_size(datadir, tmpdir):
+    from pyarrow import orc
+
     original = datadir / "TestOrcFile.int16.rle.size.orc"
     reencoded = tmpdir.join("int16_map.orc")
 
@@ -1622,7 +1593,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir):
     df.to_orc(reencoded)
 
     # Segfaults when RLE stream sizes don't account for varint length
-    pa_out = pa.orc.ORCFile(reencoded).read()
+    pa_out = orc.ORCFile(reencoded).read()
     assert df.to_arrow().equals(pa_out)
 
 
@@ -1642,11 +1613,13 @@ def test_empty_columns():
 
 
 def test_orc_reader_zstd_compression(list_struct_buff):
+    from pyarrow import orc
+
     expected = cudf.read_orc(list_struct_buff)
     # save with ZSTD compression
     buffer = BytesIO()
-    pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read()
-    writer = pyarrow.orc.ORCWriter(buffer, compression="zstd")
+    pyarrow_tbl = orc.ORCFile(list_struct_buff).read()
+    writer = orc.ORCWriter(buffer, compression="zstd")
     writer.write(pyarrow_tbl)
     writer.close()
     try:
@@ -1845,10 +1818,7 @@ def negative_timestamp_df():
 @pytest.mark.parametrize("engine", ["cudf", "pyarrow"])
 def test_orc_reader_negative_timestamp(negative_timestamp_df, engine):
     buffer = BytesIO()
-    pyorc_table = pa.Table.from_pandas(
-        negative_timestamp_df.to_pandas(), preserve_index=False
-    )
-    pyarrow.orc.write_table(pyorc_table, buffer)
+    negative_timestamp_df.to_pandas().to_orc(buffer)
 
     # We warn the user that this function will fall back to the CPU for reading
     # when the engine is pyarrow.
@@ -1859,11 +1829,13 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine):
 
 
 def test_orc_writer_negative_timestamp(negative_timestamp_df):
+    from pyarrow import orc
+
     buffer = BytesIO()
     negative_timestamp_df.to_orc(buffer)
 
     assert_eq(negative_timestamp_df, pd.read_orc(buffer))
-    assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read())
+    assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read())
 
 
 def test_orc_reader_apache_negative_timestamp(datadir):
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 39a8dca0267..90759074750 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -58,7 +58,6 @@ test = [
     "hypothesis",
     "mimesis>=4.1.0",
     "msgpack",
-    "pyorc",
     "pytest",
     "pytest-benchmark",
     "pytest-cases",