Skip to content

Commit

Permalink
TST (string dtype): resolve all xfails in IO parser tests (#60321)
Browse files — browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Nov 15, 2024
1 parent fba5f08 commit ee3c18f
Show file tree
Hide file tree
Showing 11 changed files with 49 additions and 63 deletions.
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

Expand Down Expand Up @@ -231,8 +229,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_warn_if_chunks_have_mismatched_type(all_parsers):
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
warning_type = None
parser = all_parsers
size = 10000
Expand Down Expand Up @@ -260,8 +257,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object
if parser.engine == "c" and parser.low_memory:
assert df.a.dtype == object
elif using_infer_string:
assert df.a.dtype == "str"
else:
assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import WASM
from pandas.errors import (
EmptyDataError,
Expand Down Expand Up @@ -71,14 +69,13 @@ def test_local_file(all_parsers, csv_dir_path):
pytest.skip("Failing on: " + " ".join(platform.uname()))


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
tm.assert_frame_equal(df, result)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -88,9 +86,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
def test_multi_index_no_level_names(
request, all_parsers, index_col, using_infer_string
):
if using_infer_string and all_parsers.engine == "pyarrow":
# result should have string columns instead of object dtype
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserWarning

import pandas as pd
Expand Down Expand Up @@ -57,7 +55,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
parser = all_parsers
Expand All @@ -71,7 +68,6 @@ def test_dtype_per_column(all_parsers):
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
expected["two"] = expected["two"].astype(object)

result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import WASM
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
Expand Down Expand Up @@ -184,8 +182,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
assert max(precise_errors) <= max(normal_errors)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtypes(c_parser_only):
def test_usecols_dtypes(c_parser_only, using_infer_string):
parser = c_parser_only
data = """\
1,2,3
Expand All @@ -210,8 +207,12 @@ def test_usecols_dtypes(c_parser_only):
dtype={"b": int, "c": float},
)

assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
if using_infer_string:
assert (result.dtypes == ["string", int, float]).all()
assert (result2.dtypes == ["string", float]).all()
else:
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -188,7 +186,6 @@ def convert_score(x):
tm.assert_frame_equal(results[0], results[1])


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
# see gh-1835 , GH#40589
Expand All @@ -207,7 +204,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)

xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
tm.assert_frame_equal(rs, xp)


Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -345,7 +343,6 @@ def test_infer_types_boolean_sum(all_parsers):
tm.assert_frame_equal(result, expected, check_index_type=False)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
# GH#9435
Expand All @@ -356,7 +353,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
)
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import pytest

from pandas._config import using_string_dtype

from pandas import DataFrame
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
Expand Down Expand Up @@ -121,7 +122,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
parser.read_csv(StringIO(data), names=names)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
Expand All @@ -133,7 +133,7 @@ def test_mangled_unnamed_placeholders(all_parsers):

# This test recursively updates `df`.
for i in range(3):
expected = DataFrame()
expected = DataFrame(columns=Index([], dtype="str"))

for j in range(i + 1):
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
Expand Down
31 changes: 17 additions & 14 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.parsers import STR_NA_VALUES

from pandas import (
Expand Down Expand Up @@ -261,7 +259,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"kwargs,expected",
[
Expand Down Expand Up @@ -299,7 +296,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
),
],
)
def test_na_values_keep_default(all_parsers, kwargs, expected, request):
def test_na_values_keep_default(
all_parsers, kwargs, expected, request, using_infer_string
):
data = """\
A,B,C
a,1,one
Expand All @@ -317,8 +316,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
mark = pytest.mark.xfail()
request.applymarker(mark)
if not using_infer_string or "na_values" in kwargs:
mark = pytest.mark.xfail()
request.applymarker(mark)

result = parser.read_csv(StringIO(data), **kwargs)
expected = DataFrame(expected)
Expand Down Expand Up @@ -429,23 +429,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case
@pytest.mark.parametrize(
"na_filter,row_data",
[
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
def test_na_values_na_filter_override(
request, all_parsers, na_filter, row_data, using_infer_string
):
parser = all_parsers
if parser.engine == "pyarrow":
# mismatched dtypes in both cases, FutureWarning in the True case
if not (using_infer_string and na_filter):
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
request.applymarker(mark)
data = """\
A,B
1,A
nan,B
3,C
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

expected = DataFrame(row_data, columns=["A", "B"])
Expand Down Expand Up @@ -536,7 +541,6 @@ def test_na_values_dict_aliasing(all_parsers):
tm.assert_dict_equal(na_values, na_values_copy)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_na_values_dict_null_column_name(all_parsers):
# see gh-57547
parser = all_parsers
Expand All @@ -560,11 +564,10 @@ def test_na_values_dict_null_column_name(all_parsers):
return

expected = DataFrame(
{None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}
{"x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]},
index=Index(["MA", "NA", "OA"], dtype=object),
)

expected = expected.set_index(None)

result = parser.read_csv(
StringIO(data),
index_col=0,
Expand Down
11 changes: 3 additions & 8 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -421,15 +419,14 @@ def test_parse_timezone(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@skip_pyarrow # pandas.errors.ParserError: CSV parse error
@pytest.mark.parametrize(
"date_string",
["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
)
def test_invalid_parse_delimited_date(all_parsers, date_string):
parser = all_parsers
expected = DataFrame({0: [date_string]}, dtype="object")
expected = DataFrame({0: [date_string]}, dtype="str")
result = parser.read_csv(
StringIO(date_string),
header=None,
Expand Down Expand Up @@ -609,7 +606,6 @@ def test_date_parser_usecols_thousands(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_dayfirst_warnings():
# GH 12585

Expand Down Expand Up @@ -642,7 +638,7 @@ def test_dayfirst_warnings():

# first in DD/MM/YYYY, second in MM/DD/YYYY
input = "date\n31/12/2014\n03/30/2011"
expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date")

# A. use dayfirst=True
res5 = read_csv(
Expand Down Expand Up @@ -752,7 +748,6 @@ def test_parse_dates_and_string_dtype(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_parse_dot_separated_dates(all_parsers):
# https://github.com/pandas-dev/pandas/issues/2586
parser = all_parsers
Expand All @@ -762,7 +757,7 @@ def test_parse_dot_separated_dates(all_parsers):
if parser.engine == "pyarrow":
expected_index = Index(
["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
dtype="object",
dtype="str",
name="a",
)
warn = None
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/io/parser/test_upcast.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.parsers import (
_maybe_upcast,
na_values,
Expand Down Expand Up @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan():
tm.assert_extension_array_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
# GH#36712
Expand Down

0 comments on commit ee3c18f

Please sign in to comment.