remove custom arrow code path in favor of proper dask-cudf support
rjzamora committed Aug 28, 2024
1 parent 4aa53be commit ad8df90
Showing 3 changed files with 15 additions and 192 deletions.
1 change: 0 additions & 1 deletion dask_cuda/benchmarks/custom/__init__.py

This file was deleted.

169 changes: 0 additions & 169 deletions dask_cuda/benchmarks/custom/parquet.py

This file was deleted.

37 changes: 15 additions & 22 deletions dask_cuda/benchmarks/remote_parquet.py
@@ -32,29 +32,22 @@ def read_data(
     path = DEFAULT_DATASET_PATH
     columns = DEFAULT_COLUMNS
     with dask.config.set({"dataframe.backend": backend}):
-        if filesystem == "arrow" and backend == "cudf":
-            df = custom_read_parquet(
-                path,
-                columns=columns,
-                blocksize=blocksize,
-            )
+        if filesystem == "arrow":
+            # TODO: Warn user that blocksize and aggregate_files
+            # are ignored when `filesystem == "arrow"`
+            _blocksize = {}
+            _aggregate_files = {}
         else:
-            if filesystem == "arrow":
-                # TODO: Warn user that blocksize and aggregate_files
-                # are ignored when `filesystem == "arrow"`
-                _blocksize = {}
-                _aggregate_files = {}
-            else:
-                _blocksize = {"blocksize": blocksize}
-                _aggregate_files = {"aggregate_files": aggregate_files}
-
-            df = dd.read_parquet(
-                path,
-                columns=columns,
-                filesystem=filesystem,
-                **_blocksize,
-                **_aggregate_files,
-            )
+            _blocksize = {"blocksize": blocksize}
+            _aggregate_files = {"aggregate_files": aggregate_files}
+
+        df = dd.read_parquet(
+            path,
+            columns=columns,
+            filesystem=filesystem,
+            **_blocksize,
+            **_aggregate_files,
+        )
     return df.memory_usage().compute().sum()


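For context, here is a minimal, self-contained sketch of the read path the benchmark now relies on: dask-cudf's native support for filesystem="arrow" in dd.read_parquet, as exercised in the diff above. It assumes dask-cudf is installed; the dataset path and column list are placeholders, not values from this benchmark.

import dask
import dask.dataframe as dd

path = "s3://my-bucket/dataset/"  # hypothetical dataset location
columns = ["a", "b"]              # hypothetical column selection

# Route DataFrame creation through dask-cudf (GPU-backed cudf DataFrames).
with dask.config.set({"dataframe.backend": "cudf"}):
    # The pyarrow-based filesystem path. blocksize/aggregate_files are
    # not passed because they are ignored on this path (see the TODO
    # in the diff above).
    df = dd.read_parquet(path, columns=columns, filesystem="arrow")

# Same summary statistic the benchmark computes.
print(df.memory_usage().compute().sum())

Note that the TODO in the diff still applies: blocksize and aggregate_files are only forwarded on the fsspec path, so passing them alongside filesystem="arrow" has no effect.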
