Avoid pyarrow.fs import for local storage (#14321)
This is not a resolution, but may help mitigate problems from aws/aws-sdk-cpp#2681

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: #14321
rjzamora authored Oct 24, 2023
1 parent b390bca commit 19d791c
Showing 4 changed files with 36 additions and 4 deletions.
python/cudf/cudf/io/orc.py: 5 changes (4 additions, 1 deletion)
@@ -5,7 +5,6 @@
 
 import pyarrow as pa
 from fsspec.utils import stringify_path
-from pyarrow import orc as orc
 
 import cudf
 from cudf._lib import orc as liborc
@@ -17,6 +16,8 @@
 
 
 def _make_empty_df(filepath_or_buffer, columns):
+    from pyarrow import orc
+
     orc_file = orc.ORCFile(filepath_or_buffer)
     schema = orc_file.schema
     col_names = schema.names if columns is None else columns
@@ -150,6 +151,7 @@ def _parse_column_statistics(cs, column_statistics_blob):
 @ioutils.doc_read_orc_metadata()
 def read_orc_metadata(path):
     """{docstring}"""
+    from pyarrow import orc
 
     orc_file = orc.ORCFile(path)
 
@@ -380,6 +382,7 @@ def read_orc(
             )
         )
     else:
+        from pyarrow import orc
 
         def read_orc_stripe(orc_file, stripe, columns):
             pa_table = orc_file.read_stripe(stripe, columns)
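All of the orc.py hunks above apply the same change: the pyarrow.orc import moves from module scope into the functions that actually touch ORC data, so importing cudf by itself never loads it. A rough sketch of the pattern (illustrative only, not cudf code; read_orc_schema is a hypothetical name):

    def read_orc_schema(path):
        # Deferred import: pyarrow.orc is loaded only when an ORC file is
        # actually inspected, never as a side effect of importing this module.
        from pyarrow import orc

        return orc.ORCFile(path).schema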
python/cudf/cudf/io/parquet.py: 11 changes (9 additions, 2 deletions)
@@ -15,7 +15,7 @@
 
 import numpy as np
 import pandas as pd
-from pyarrow import dataset as ds, parquet as pq
+from pyarrow import dataset as ds
 
 import cudf
 from cudf._lib import parquet as libparquet
@@ -266,6 +266,7 @@ def write_to_dataset(
 @_cudf_nvtx_annotate
 def read_parquet_metadata(path):
     """{docstring}"""
+    import pyarrow.parquet as pq
 
     pq_file = pq.ParquetFile(path)
 
@@ -303,7 +304,9 @@ def _process_dataset(
 
     # Convert filters to ds.Expression
     if filters is not None:
-        filters = pq.filters_to_expression(filters)
+        from pyarrow.parquet import filters_to_expression
+
+        filters = filters_to_expression(filters)
 
     # Initialize ds.FilesystemDataset
     # TODO: Remove the if len(paths) workaround after following bug is fixed:
@@ -825,6 +828,8 @@ def _read_parquet(
             use_pandas_metadata=use_pandas_metadata,
         )
     else:
+        import pyarrow.parquet as pq
+
         return cudf.DataFrame.from_arrow(
             pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                 columns=columns, *args, **kwargs
@@ -930,6 +935,8 @@ def to_parquet(
         )
 
     else:
+        import pyarrow.parquet as pq
+
         if partition_offsets is not None:
             warnings.warn(
                 "partition_offsets will be ignored when engine is not cudf"
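The parquet.py hunks defer pyarrow.parquet in the same way, including the filter-conversion path. A small sketch of that path (filters_to_ds_expression is a hypothetical wrapper; filters_to_expression is the pyarrow.parquet helper used in the diff):

    def filters_to_ds_expression(filters):
        # Deferred import: only needed when DNF-style filters are supplied.
        from pyarrow.parquet import filters_to_expression

        return filters_to_expression(filters)

    # e.g. filters_to_ds_expression([("year", "=", 2023)]) returns a
    # pyarrow.dataset expression for use when constructing the dataset.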
python/cudf/cudf/tests/test_s3.py: 14 changes (14 additions, 0 deletions)
@@ -533,3 +533,17 @@ def test_write_chunked_parquet(s3_base, s3so):
         actual.sort_values(["b"]).reset_index(drop=True),
         cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True),
     )
+
+
+def test_no_s3fs_on_cudf_import():
+    import subprocess
+    import sys
+
+    output = subprocess.check_output(
+        [
+            sys.executable,
+            "-c",
+            "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)",
+        ]
+    )
+    assert output.strip() == b"False"
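The new test shells out to a fresh interpreter so that modules cached by earlier tests cannot hide a regression. Roughly the same check can be run by hand; a stand-alone sketch (assumes cudf is installed, not part of the test suite):

    import subprocess
    import sys

    code = "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)"
    out = subprocess.check_output([sys.executable, "-c", code])
    print(out.strip())  # expected: b'False' after this change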
python/cudf/cudf/utils/ioutils.py: 10 changes (9 additions, 1 deletion)
@@ -13,7 +13,6 @@
 import pandas as pd
 from fsspec.core import get_fs_token_paths
 from pyarrow import PythonFile as ArrowPythonFile
-from pyarrow.fs import FSSpecHandler, PyFileSystem
 from pyarrow.lib import NativeFile
 
 from cudf.utils.docutils import docfmt_partial
@@ -1630,6 +1629,15 @@ def _open_remote_files(
             for path, rgs in zip(paths, row_groups)
         ]
 
+    # Avoid top-level pyarrow.fs import.
+    # Importing pyarrow.fs initializes a S3 SDK with a finalizer
+    # that runs atexit. In some circumstances it appears this
+    # runs a call into a logging system that is already shutdown.
+    # To avoid this, we only import this subsystem if it is
+    # really needed.
+    # See https://github.com/aws/aws-sdk-cpp/issues/2681
+    from pyarrow.fs import FSSpecHandler, PyFileSystem
+
     # Default open - Use pyarrow filesystem API
     pa_fs = PyFileSystem(FSSpecHandler(fs))
     return [
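A minimal sketch of how the deferred pyarrow.fs import above is exercised (illustrative; wrap_fsspec_for_arrow is a hypothetical name, and an in-memory fsspec filesystem stands in for a real remote one):

    import fsspec

    def wrap_fsspec_for_arrow(fs):
        # Importing pyarrow.fs only here keeps the AWS SDK initialization, and
        # its atexit finalizer, out of processes that never open remote files.
        from pyarrow.fs import FSSpecHandler, PyFileSystem

        return PyFileSystem(FSSpecHandler(fs))

    pa_fs = wrap_fsspec_for_arrow(fsspec.filesystem("memory"))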
