Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Fix gpu cudf to_csv #799

Merged
merged 14 commits into from
Aug 21, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
import pandas as pd

from ....core import tile
from ....tests.core import support_cuda
from ... import DataFrame


def test_to_csv():
@support_cuda
def test_to_csv(setup_gpu, gpu):
raw = pd.DataFrame(np.random.rand(10, 5))
df = DataFrame(raw, chunk_size=4)
df = DataFrame(raw, gpu=gpu, chunk_size=4)

r = df.to_csv("*.csv")
r = tile(r)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@
fastparquet = None

from .... import dataframe as md
from ....tests.core import flaky
from ....tests.core import flaky, support_cuda
from ... import DataFrame
from ...utils import PD_VERSION_GREATER_THAN_2_10


def test_to_csv_execution(setup):
@support_cuda
def test_to_csv_execution(setup, setup_gpu, gpu):
index = pd.RangeIndex(100, 0, -1, name="index")
raw = pd.DataFrame(
{
Expand All @@ -53,7 +54,7 @@ def test_to_csv_execution(setup):
},
index=index,
)
df = DataFrame(raw, chunk_size=33)
df = DataFrame(raw, gpu=gpu, chunk_size=33)

with tempfile.TemporaryDirectory() as base_path:
# DATAFRAME TESTS
Expand Down Expand Up @@ -82,7 +83,7 @@ def test_to_csv_execution(setup):
pd.testing.assert_frame_equal(dfs[1].set_index("index"), raw.iloc[33:66])

# test df with unknown shape
df2 = DataFrame(raw, chunk_size=(50, 2))
df2 = DataFrame(raw, gpu=gpu, chunk_size=(50, 2))
df2 = df2[df2["col1"] < 1]
path2 = os.path.join(base_path, "out2.csv")
df2.to_csv(path2).execute()
Expand All @@ -92,32 +93,34 @@ def test_to_csv_execution(setup):
pd.testing.assert_frame_equal(result, raw)

# SERIES TESTS
series = md.Series(raw.col1, chunk_size=33)

# test one file with series
path = os.path.join(base_path, "out.csv")
series.to_csv(path).execute()

result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
result.set_index("index", inplace=True)
pd.testing.assert_frame_equal(result, raw.col1.to_frame())

# test multi files with series
path = os.path.join(base_path, "out-*.csv")
series.to_csv(path).execute()

dfs = [
pd.read_csv(
os.path.join(base_path, f"out-{i}.csv"), dtype=raw.dtypes.to_dict()
# cudf Series does not support to_csv
if gpu == False:
series = md.Series(raw.col1, chunk_size=33)

# test one file with series
path = os.path.join(base_path, "out.csv")
series.to_csv(path).execute()

result = pd.read_csv(path, dtype=raw.dtypes.to_dict())
result.set_index("index", inplace=True)
pd.testing.assert_frame_equal(result, raw.col1.to_frame())

# test multi files with series
path = os.path.join(base_path, "out-*.csv")
series.to_csv(path).execute()

dfs = [
pd.read_csv(
os.path.join(base_path, f"out-{i}.csv"), dtype=raw.dtypes.to_dict()
)
for i in range(4)
]
result = pd.concat(dfs, axis=0)
result.set_index("index", inplace=True)
pd.testing.assert_frame_equal(result, raw.col1.to_frame())
pd.testing.assert_frame_equal(
dfs[1].set_index("index"), raw.col1.to_frame().iloc[33:66]
)
for i in range(4)
]
result = pd.concat(dfs, axis=0)
result.set_index("index", inplace=True)
pd.testing.assert_frame_equal(result, raw.col1.to_frame())
pd.testing.assert_frame_equal(
dfs[1].set_index("index"), raw.col1.to_frame().iloc[33:66]
)


@pytest.mark.skipif(sqlalchemy is None, reason="sqlalchemy not installed")
Expand Down
15 changes: 14 additions & 1 deletion python/xorbits/_mars/dataframe/datastore/to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from ...tensor.core import TensorOrder
from ...tensor.operands import TensorOperand, TensorOperandMixin
from ..operands import DataFrameOperand, DataFrameOperandMixin
from ..utils import is_pandas_2, parse_index
from ..utils import is_cudf, is_pandas_2, parse_index


class DataFrameToCSV(DataFrameOperand, DataFrameOperandMixin):
Expand Down Expand Up @@ -374,6 +374,19 @@ def _to_csv(cls, op, df, path, header=None):
kwargs["line_terminator"] = op.lineterminator
kwargs.pop("lineterminator")

# cudf does not support the following parameters
if is_cudf(df):
kwargs.pop("float_format")
kwargs.pop("index_label")
kwargs.pop("mode")
kwargs.pop("quoting")
kwargs.pop("quotechar")
kwargs.pop("date_format")
kwargs.pop("doublequote")
kwargs.pop("escapechar")
kwargs.pop("decimal")
kwargs["compression"] = None

df.to_csv(path, **kwargs)

@classmethod
Expand Down
Loading