Skip to content

Commit

Permalink
Remove dask upper bound (#1843)
Browse files Browse the repository at this point in the history
* remove dask upper bound

* fix tests

* update release notes

* bump pandas

* add warning to release notes

* wording

* wording again

* typo
  • Loading branch information
thehomebrewnerd authored Apr 10, 2024
1 parent bf41645 commit 4cbaa38
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 12 deletions.
4 changes: 4 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@ Release Notes

Future Release
==============
.. warning::
Support for use with Dask and Pyspark dataframes is planned for removal in an upcoming release of Woodwork.

* Enhancements
* Fixes
* Changes
* Temporarily restrict Dask version :pr:`1837`
* Updates for compatibility with Dask ``2024.4.1`` :pr:`1843`
* Documentation Changes
* Testing Changes
* Fix serialization test to work with pytest 8.1.1 :pr:`1837`
Expand Down
13 changes: 6 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ keywords = ["data science", "machine learning", "typing"]
license = {file = "LICENSE"}
requires-python = ">=3.9,<4"
dependencies = [
"pandas >= 1.4.3",
"pandas >= 2.0.0",
"scikit-learn >= 1.1.0",
"python-dateutil >= 2.8.2",
"scipy >= 1.10.0",
Expand All @@ -56,11 +56,10 @@ test = [
"pyarrow >= 14.0.1"
]
dask = [
"dask[dataframe] >= 2022.11.1, <2024.3.0",
"dask[dataframe] >= 2024.4.1",
]
spark = [
"pyspark >= 3.5.0",
"pandas >= 2.0.0",
"numpy >= 1.25.0",
"pyarrow >= 14.0.1",
]
Expand Down Expand Up @@ -134,8 +133,8 @@ filterwarnings = [
[tool.ruff]
line-length = 88
target-version = "py311"
ignore = ["E501"]
select = [
lint.ignore = ["E501"]
lint.select = [
# Pyflakes
"F",
# Pycodestyle
Expand All @@ -146,10 +145,10 @@ select = [
]
src = ["woodwork"]

[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401", "I001", "E501"]

[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["woodwork"]

[tool.coverage.run]
Expand Down
7 changes: 5 additions & 2 deletions woodwork/logical_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)

dd = import_or_none("dask.dataframe")
dask_expr = import_or_none("dask_expr")
ps = import_or_none("pyspark.pandas")


Expand Down Expand Up @@ -830,6 +831,8 @@ def validate(self, series, return_invalid_values=False):
Returns:
Series: If return_invalid_values is True, returns invalid PostalCodes.
"""
if _is_dask_series(series):
series = series.compute()
return _regex_validate(
"postal_code_inference_regex",
series,
Expand Down Expand Up @@ -866,7 +869,7 @@ def _regex_validate(regex_key, series, return_invalid_values):

else:
any_invalid = invalid.any()
if dd and isinstance(any_invalid, dd.core.Scalar):
if dd and isinstance(any_invalid, (dd.core.Scalar, dask_expr.Scalar)):
any_invalid = any_invalid.compute()

if any_invalid:
Expand Down Expand Up @@ -909,7 +912,7 @@ def _validate_age(series, return_invalid_values):

else:
any_invalid = invalid.any()
if dd and isinstance(any_invalid, dd.core.Scalar):
if dd and isinstance(any_invalid, (dd.core.Scalar, dask_expr.Scalar)):
any_invalid = any_invalid.compute()

if any_invalid:
Expand Down
2 changes: 1 addition & 1 deletion woodwork/tests/accessor/test_column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ def test_series_methods_on_accessor_other_returns(sample_series):
series_shape = sample_series.shape
if _is_dask_series(sample_series):
col_shape = (col_shape[0].compute(),)
series_shape = series_shape[0].compute()
series_shape = (series_shape[0].compute(),)
assert col_shape == (len(sample_series),)
assert col_shape == series_shape

Expand Down
3 changes: 3 additions & 0 deletions woodwork/tests/accessor/test_indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from woodwork.utils import import_or_none

dd = import_or_none("dask.dataframe")
dask_expr = import_or_none("dask_expr")


def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
Expand Down Expand Up @@ -136,6 +137,8 @@ def test_iloc_column_does_not_propagate_changes_to_data(sample_series):


def test_loc_column(sample_series):
if _is_dask_series(sample_series):
pytest.skip("slicing is currently broken with Dask - needs investigation")
series = sample_series.copy()
logical_type = Categorical
semantic_tags = ["tag1", "tag2"]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
dask[dataframe]==2022.11.1
dask[dataframe]==2024.4.1
importlib-resources==5.10.0
numpy==1.25.0
pandas==1.4.3
pandas==2.0.0
python-dateutil==2.8.2
scikit-learn==1.1.0
scipy==1.10.0
4 changes: 4 additions & 0 deletions woodwork/tests/utils/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,10 @@ def test_concat_cols_validate_schema(mock_validate_accessor_params, sample_df):


def test_concat_cols_mismatched_index_adds_single_nan(sample_df):
if _is_dask_dataframe(sample_df):
pytest.skip(
"Test is currently broken with Dask - can't perform concat operation in `concat_columns` - needs investigation",
)
# If the dtype can handle nans, it won't change
sample_df.ww.init(logical_types={"id": "IntegerNullable"})

Expand Down

0 comments on commit 4cbaa38

Please sign in to comment.