From 98e473532e974ffa58ad96a85b0c2bff6b7e6b7a Mon Sep 17 00:00:00 2001
From: Gaurav Sheni
Date: Fri, 2 Feb 2024 17:16:56 -0500
Subject: [PATCH] Update numpy to latest (#1799)

* initial commit
* bump pandas min 1.5.0
* testing
* testing
* update numpy
* update numpy
* update numpy
* update numpy
* loosen numpy
* loosen numpy
* remove python 3.8
* remove python 3.8
* revert read file
* lint fix
* Updated release notes and pinned numpy under 2.0.0
* incorrect pr num
* update minimum requirements
* update minimum dask
* min spark version
* set min scikit-learn for min spark
* first pass fix doc build
* second pass build docs
* Add line ending

  Added line ending to file

* Add line to end of release notes
* Missing blank line

  In release notes

* Revert "Merge remote-tracking branch 'origin/integrate_string_arrow' into update_numpy"

  This reverts commit dc4ba5bffd69d09856cded631177341dcc8cadef, reversing
  changes made to f59074b29259eb1c9f75b19f51e18544998cbba2.

* Doc updates, tests pass

  All tests pass with upgraded libs

* update min pyarrow in test reqs

  Update the minimum pyarrow package in the test requirements

* moto.mock_s3 -> moto.mock_aws

  name change in library

* Updated min req for moto
* boto3 min updated

  moto upgrade requires an upgrade to boto3

* parquet - try forcing INT96 timestamp

  Workaround for the Minimum Dependencies (Spark) test

* Remove temp parquet file

  Used for local manual test and slipped through

* Updates per PR review
* Modified min requirements

  Based on running action against branch

* Missing = in requirements

  dumb

* Incorrect scikit-learn version

  s/b 0.22 not 0.2.2

* Min scikit-learn 1.1.0
* spark requires python-dateutil 2.8.2
* pyspark min 3.5.0 to pass tests
* "revert" cast in _get_histogram_values.py

  Not clear why this was necessary.
  Tests pass without it

* _get_histogram_values cast re-added

  With an updated filter

---------

Co-authored-by: Parthiv Naresh
Co-authored-by: Christopher Park
---
 .github/workflows/build_docs.yaml             | 10 ++++----
 .github/workflows/install_test.yaml           |  6 ++---
 .../workflows/latest_dependency_checker.yaml  |  4 ++--
 .github/workflows/tests_with_latest_deps.yaml | 20 ++++++++--------
 .../workflows/tests_with_minimum_deps.yaml    |  6 ++---
 ...odwork_main_airflow_performance_tests.yaml |  4 ++--
 .pre-commit-config.yaml                       |  2 +-
 .readthedocs.yaml                             |  2 +-
 .../using_woodwork_with_dask_and_spark.ipynb  |  5 +++-
 docs/source/install.md                        | 12 +++++-----
 docs/source/release_notes.rst                 | 14 +++++++++++
 pyproject.toml                                | 24 +++++++++----------
 woodwork/serializers/parquet_serializer.py    |  2 +-
 .../statistics_utils/_get_histogram_values.py | 11 ++++++++-
 .../tests/accessor/test_column_accessor.py    |  4 +++-
 woodwork/tests/accessor/test_serialization.py | 18 ++++++++++----
 woodwork/tests/accessor/test_statistics.py    |  2 +-
 woodwork/tests/conftest.py                    | 16 +++++++++----
 .../minimum_core_requirements.txt             |  4 ++--
 .../minimum_dask_requirements.txt             |  5 ++--
 .../minimum_spark_requirements.txt            | 12 +++++-----
 .../minimum_test_requirements.txt             |  6 ++---
 woodwork/tests/testing_utils/__init__.py      |  4 +++-
 woodwork/tests/testing_utils/table_utils.py   | 13 +++++++++-
 .../tests/type_system/test_ltype_inference.py |  2 +-
 25 files changed, 133 insertions(+), 75 deletions(-)
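The `moto.mock_s3 -> moto.mock_aws` commit above tracks a rename in moto 5.0, which replaced the per-service decorators (`mock_s3`, `mock_sts`, and so on) with a single `mock_aws` decorator. The test hunks covered by that commit are truncated from this patch, so the following is only a minimal sketch of the migration; the bucket name, key, and test body are illustrative, not taken from the woodwork test suite.

```python
# Minimal sketch of the moto 5.x migration; the bucket, key, and test
# body are illustrative, not taken from the woodwork test suite.
import boto3
from moto import mock_aws  # moto < 5.0 exposed per-service mocks such as mock_s3


@mock_aws  # previously: @mock_s3
def test_read_write_s3():
    # Inside the decorator, boto3 calls hit moto's in-memory AWS backend.
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="example-bucket")
    s3.put_object(Bucket="example-bucket", Key="data.csv", Body=b"id\n1\n")
    body = s3.get_object(Bucket="example-bucket", Key="data.csv")["Body"].read()
    assert body.startswith(b"id")
```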
diff --git a/.github/workflows/build_docs.yaml b/.github/workflows/build_docs.yaml
index 74be2f121..2c71edc37 100644
--- a/.github/workflows/build_docs.yaml
+++ b/.github/workflows/build_docs.yaml
@@ -11,11 +11,11 @@ env:
   ALTERYX_OPEN_SRC_UPDATE_CHECKER: False
 jobs:
   build_docs:
-    name: 3.8 build docs
+    name: 3.9 build docs
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python_version: ["3.8"]
+        python_version: ["3.9"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
@@ -26,12 +26,12 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          cache: 'pip'
+          cache: 'pip'
           cache-dependency-path: 'pyproject.toml'
       - uses: actions/cache@v3
         id: cache
         with:
-          path: ${{ env.pythonLocation }}
+          path: ${{ env.pythonLocation }}
           key: ${{ matrix.python_version }}-lint-${{ env.pythonLocation }}-${{ hashFiles('**/pyproject.toml') }}-v01
       - name: Install apt requirements
         run: |
@@ -42,7 +42,7 @@
       - name: Install woodwork with doc dependencies (not using cache)
         if: steps.cache.outputs.cache-hit != 'true'
         run: |
-          python -m pip install .[dev]
+          python -m pip install ".[docs]"
       - name: Install woodwork with no doc dependencies (using cache)
         if: steps.cache.outputs.cache-hit == 'true'
         run: |
diff --git a/.github/workflows/install_test.yaml b/.github/workflows/install_test.yaml
index 1ccf163da..09aad7a22 100644
--- a/.github/workflows/install_test.yaml
+++ b/.github/workflows/install_test.yaml
@@ -14,7 +14,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest]
-        python_version: ["3.8", "3.9", "3.10", "3.11"]
+        python_version: ["3.9", "3.10", "3.11"]
     runs-on: ${{ matrix.os }}
     steps:
       - name: Checkout repository
@@ -26,12 +26,12 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          cache: 'pip'
+          cache: 'pip'
           cache-dependency-path: 'pyproject.toml'
       - uses: actions/cache@v3
         id: cache
         with:
-          path: ${{ env.pythonLocation }}
+          path: ${{ env.pythonLocation }}
           key: ${{ matrix.os }}-${{ matrix.python_version }}-install-${{ env.pythonLocation }}-${{ hashFiles('**/pyproject.toml') }}-v01
       - name: Build woodwork package
         run: |
diff --git a/.github/workflows/latest_dependency_checker.yaml b/.github/workflows/latest_dependency_checker.yaml
index d9698e2b0..c63accc32 100644
--- a/.github/workflows/latest_dependency_checker.yaml
+++ b/.github/workflows/latest_dependency_checker.yaml
@@ -12,10 +12,10 @@
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v4
         with:
-          python-version: '3.8.x'
+          python-version: '3.9.x'
       - name: Install pip and virtualenv
         run: |
           python -m pip install --upgrade pip
diff --git a/.github/workflows/tests_with_latest_deps.yaml b/.github/workflows/tests_with_latest_deps.yaml
index 83b826319..8f21621eb 100644
--- a/.github/workflows/tests_with_latest_deps.yaml
+++ b/.github/workflows/tests_with_latest_deps.yaml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        python_version: ["3.8", "3.9", "3.10", "3.11"]
+        python_version: ["3.9", "3.10", "3.11"]
         directories: ["All Other Tests", "Testing Table Accessor", "Testing to Disk with LatLong", "All other Serialization"]
     steps:
       - name: Set up python ${{ matrix.python_version }}
@@ -49,47 +49,47 @@
           python -m pip install unpacked_sdist/[dask]
           cd unpacked_sdist
           coverage erase
-      - if: ${{ matrix.python_version != 3.8 && matrix.directories == 'Testing to Disk with LatLong' }}
+      - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Testing to Disk with LatLong' }}
        name: Run testing to Disk with LatLong Unit Tests (no code coverage)
        run: |
          cd unpacked_sdist
          pytest woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0
-      - if: ${{ matrix.python_version != 3.8 && matrix.directories == 'All other Serialization' }}
+      - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'All other Serialization' }}
        name: Run all other Serialization Unit Tests (no code coverage)
        run: |
          cd unpacked_sdist
          pytest woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0
-      - if: ${{ matrix.python_version != 3.8 && matrix.directories == 'Testing Table Accessor' }}
+      - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'Testing Table Accessor' }}
        name: Run Table Accessor Unit Tests (no code coverage)
        run: |
          cd unpacked_sdist
          pytest woodwork/tests/accessor/test_table_accessor.py -n 2 --durations 0
-      - if: ${{ matrix.python_version != 3.8 && matrix.directories == 'All Other Tests' }}
+      - if: ${{ matrix.python_version != 3.9 && matrix.directories == 'All Other Tests' }}
        name: Run all other Unit Tests (no code coverage)
        run: |
          cd unpacked_sdist
          pytest woodwork/ -n 2 --ignore=woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_table_accessor.py --durations 0
-      - if: ${{ matrix.python_version == 3.8 && matrix.directories == 'Testing to Disk with LatLong' }}
+      - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Testing to Disk with LatLong' }}
        name: Run Testing to Disk with LatLong Unit Tests with code coverage
        run: |
          cd unpacked_sdist
          pytest woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml
-      - if: ${{ matrix.python_version == 3.8 && matrix.directories == 'All other Serialization' }}
+      - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'All other Serialization' }}
        name: Run all other Serialization Unit Tests with code coverage
        run: |
          cd unpacked_sdist
          pytest woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_serialization.py::test_to_disk_with_latlong -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml
-      - if: ${{ matrix.python_version == 3.8 && matrix.directories == 'Testing Table Accessor' }}
+      - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'Testing Table Accessor' }}
        name: Run Table Accessor Unit Tests with code coverage
        run: |
          cd unpacked_sdist
          pytest woodwork/tests/accessor/test_table_accessor.py -n 2 --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml
-      - if: ${{ matrix.python_version == 3.8 && matrix.directories == 'All Other Tests' }}
+      - if: ${{ matrix.python_version == 3.9 && matrix.directories == 'All Other Tests' }}
        name: Run all other Unit Tests with code coverage
        run: |
          cd unpacked_sdist
          pytest woodwork/ -n 2 --ignore=woodwork/tests/accessor/test_serialization.py --ignore=woodwork/tests/accessor/test_table_accessor.py --durations 0 --cov=woodwork --cov-config=../pyproject.toml --cov-report=xml:../coverage.xml
-      - if: ${{ matrix.python_version == 3.8 }}
+      - if: ${{ matrix.python_version == 3.9 }}
        name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
diff --git a/.github/workflows/tests_with_minimum_deps.yaml b/.github/workflows/tests_with_minimum_deps.yaml
index a0e16a3b9..36fb228c4 100644
--- a/.github/workflows/tests_with_minimum_deps.yaml
+++ b/.github/workflows/tests_with_minimum_deps.yaml
@@ -7,7 +7,7 @@ on:
       - main
 jobs:
   py38_unit_tests_minimum_dependencies:
-    name: Tests - 3.8 Minimum Dependencies
+    name: Tests - 3.9 Minimum Dependencies
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -18,10 +18,10 @@
         with:
           ref: ${{ github.event.pull_request.head.ref }}
           repository: ${{ github.event.pull_request.head.repo.full_name }}
-      - name: Set up python 3.8
+      - name: Set up python 3.9
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3.9
       - name: Install woodwork - minimum tests requirements
         run: |
           python -m pip install -e . --no-dependencies
diff --git a/.github/workflows/woodwork_main_airflow_performance_tests.yaml b/.github/workflows/woodwork_main_airflow_performance_tests.yaml
index a1a63cbe5..9341b22d6 100644
--- a/.github/workflows/woodwork_main_airflow_performance_tests.yaml
+++ b/.github/workflows/woodwork_main_airflow_performance_tests.yaml
@@ -27,7 +27,7 @@ jobs:
           echo "PREVIOUS_HASH=$(git rev-parse --short HEAD~1)" >> $GITHUB_ENV
           echo "Previous commit hash: ${{ env.PREVIOUS_HASH }}"
       - name: Run airflow tests and generate report
-        run: |
+        run: |
           curl --location --request POST '${{ secrets.AIRFLOW_BASE_URL }}dags/woodwork_run_tests_generate_report/dagRuns' \
           -u '${{ secrets.AIRFLOW_WW_USER }}:${{ secrets.AIRFLOW_WW_PASS }}' \
           --header 'Content-Type: application/json' \
@@ -36,7 +36,7 @@
               "description": null,
               "n_trials": 1,
               "pytest_args": {},
-              "python_version": "3.8",
+              "python_version": "3.9",
               "scenarios_yaml": "woodwork_scenarios.yaml",
               "woodwork_branch_previous": "${{ env.PREVIOUS_HASH }}",
               "woodwork_branch_new": "${{ env.CURRENT_HASH }}",
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e99bfc805..1f3bd06e9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
       - id: add-trailing-comma
         name: Add trailing comma
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: 'v0.1.13'
+    rev: 'v0.1.14'
     hooks:
       - id: ruff
        types_or: [ python, pyi, jupyter ]
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index fa0d362ed..90dbda93f 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -16,7 +16,7 @@ formats: []
 build:
   os: "ubuntu-22.04"
   tools:
-    python: "3.8"
+    python: "3.9"
   apt_packages:
     - openjdk-11-jre-headless
   jobs:
diff --git a/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb b/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb
index cd11cace2..9e5f9ee85 100644
--- a/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb
+++ b/docs/source/guides/using_woodwork_with_dask_and_spark.ipynb
@@ -322,7 +322,10 @@
     "Woodwork allows column names of any format that is supported by the DataFrame. However, Dask DataFrames do not currently support integer column names.\n",
     "\n",
     "### Setting DataFrame Index\n",
-    "When specifying a Woodwork index with a pandas DataFrame, the underlying index of the DataFrame will be updated to match the column specified as the Woodwork index. When specifying a Woodwork index on a Dask or Spark DataFrame, however, the underlying index will remain unchanged.\n"
+    "When specifying a Woodwork index with a pandas DataFrame, the underlying index of the DataFrame will be updated to match the column specified as the Woodwork index. When specifying a Woodwork index on a Dask or Spark DataFrame, however, the underlying index will remain unchanged.\n",
+    "\n",
+    "### Dask `string[pyarrow]`\n",
+    "Woodwork may have issues with the new string storage model used by Dask. To work around this, add `dask.config.set({'dataframe.convert-string': False})` before running Dask operations.\n"
    ]
   }
 ],
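The new notebook cell above describes the `dataframe.convert-string` workaround. A minimal runnable sketch of it, with an illustrative one-column DataFrame, looks like this:

```python
# Sketch of the workaround from the doc note above; the sample data is
# illustrative. The config flag must be set before DataFrames are built.
import dask
import dask.dataframe as dd
import pandas as pd

# Keep object dtype instead of Dask's default string[pyarrow] storage,
# which Woodwork may mishandle.
dask.config.set({"dataframe.convert-string": False})

ddf = dd.from_pandas(pd.DataFrame({"words": ["a", "b", "c"]}), npartitions=1)
print(ddf.dtypes)  # "words" stays object rather than string[pyarrow]
```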
diff --git a/docs/source/install.md b/docs/source/install.md
index 90d1b96d3..6fcd3fd69 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -1,6 +1,6 @@
 # Install
 
-Woodwork is available for Python 3.8 - 3.11. It can be installed from PyPI, conda-forge, or from source.
+Woodwork is available for Python 3.9 - 3.11. It can be installed from PyPI, conda-forge, or from source.
 
 To install Woodwork, run the following command:
 
@@ -123,7 +123,7 @@ You can do so by installing it as a package inside a container (following the no
 creating a new image with Woodwork pre-installed, using the following commands in your `Dockerfile`:
 
 ```dockerfile
-FROM --platform=linux/x86_64 python:3.8-slim-buster
+FROM --platform=linux/x86_64 python:3.9-slim-buster
 RUN apt update && apt -y update
 RUN apt install -y build-essential
 RUN pip3 install --upgrade --quiet pip
@@ -135,11 +135,11 @@ Woodwork has several other Python dependencies that are used only for specific m
 
 | Dependency        | Min Version | Notes                                  |
 |-------------------|-------------|----------------------------------------|
-| boto3             | 1.10.45     | Required to read/write to URLs and S3  |
+| boto3             | 1.34.32     | Required to read/write to URLs and S3  |
 | smart_open        | 5.0.0       | Required to read/write to URLs and S3  |
-| pyarrow           | 4.0.1       | Required to serialize to parquet       |
-| dask[distributed] | 2021.10.0   | Required to use with Dask DataFrames   |
-| pyspark           | 3.2.0       | Required to use with Spark DataFrames  |
+| pyarrow           | 15.0.0      | Required to serialize to parquet       |
+| dask[distributed] | 2024.1.0    | Required to use with Dask DataFrames   |
+| pyspark           | 3.5.0       | Required to use with Spark DataFrames  |
 
 # Development
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index fc2e1d930..c3b41d042 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -13,6 +13,20 @@ Release Notes
 .. Thanks to the following people for contributing to this release:
 
+v0.28.0
+====================
+    * Enhancements
+    * Fixes
+    * Changes
+        * Upgraded numpy to < 2.0.0 :pr:`1799`
+    * Documentation Changes
+        * Added dask string storage note to "Other Limitations" in Dask documentation :pr:`1799`
+    * Testing Changes
+        * Upgraded moto and boto3 :pr:`1799`
+
+    Thanks to the following people for contributing to this release:
+    :user:`cp2boston`, :user:`gsheni`
+
 v0.27.0 Dec 12, 2023
 ====================
     * Fixes
diff --git a/pyproject.toml b/pyproject.toml
index d32abadf1..4180e6245 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,6 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -28,14 +27,14 @@ maintainers = [
 ]
 keywords = ["data science", "machine learning", "typing"]
 license = {file = "LICENSE"}
-requires-python = ">=3.8,<4"
+requires-python = ">=3.9,<4"
 dependencies = [
     "pandas >= 1.4.3",
-    "scikit-learn >= 0.22",
+    "scikit-learn >= 1.1.0",
     "python-dateutil >= 2.8.1",
     "scipy >= 1.10.0",
     "importlib-resources >= 5.10.0",
-    "numpy >= 1.22.0, <1.25.0",
+    "numpy >= 1.25.0, <2.0.0",
 ]
 
 [project.urls]
@@ -51,19 +50,19 @@ test = [
     "pytest >= 7.0.1",
     "pytest-cov >= 2.10.1",
     "pytest-xdist >= 2.1.0",
-    "boto3 >= 1.10.45",
-    "moto[all] >= 3.0.7",
+    "boto3 >= 1.34.32",
+    "moto[all] >= 5.0.0",
     "smart-open >= 5.0.0",
-    "pyarrow >= 4.0.1, <13.0.0",
+    "pyarrow >= 14.0.1"
 ]
 dask = [
     "dask[dataframe] >= 2022.11.1",
 ]
 spark = [
-    "pyspark >= 3.2.2",
-    "pandas >= 1.4.3, <2.0.0",
-    "numpy < 1.24.0",
-    "pyarrow >= 4.0.1, <13.0.0",
+    "pyspark >= 3.5.0",
+    "pandas >= 2.0.0",
+    "numpy >= 1.25.0",
+    "pyarrow >= 14.0.1",
 ]
 updater = [
     "alteryx-open-src-update-checker >= 3.1.0"
@@ -83,8 +82,7 @@ docs = [
 dev = [
     "ruff >= 0.1.6",
     "pre-commit >= 2.20.0",
-    "click >= 7.1.2, <8.1.0",
-    "woodwork[docs, dask, spark, test]",
+    "click >= 8.1.7"
 ]
 complete = [
     "woodwork[dask, spark, updater]",
diff --git a/woodwork/serializers/parquet_serializer.py b/woodwork/serializers/parquet_serializer.py
index b81dba1cc..c2ac28405 100644
--- a/woodwork/serializers/parquet_serializer.py
+++ b/woodwork/serializers/parquet_serializer.py
@@ -111,7 +111,7 @@ def _save_parquet_table_to_disk(self):
             **table_metadata,
         }
         table = table.replace_schema_metadata(combined_meta)
-        pq.write_table(table, update_file)
+        pq.write_table(table, update_file, use_deprecated_int96_timestamps=True)
 
         # Remove checksum files which prevent deserialization if present due to updated parquet header
         crc_files = [f for f in files if Path(f).suffix == ".crc"]
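The `use_deprecated_int96_timestamps=True` change above corresponds to the "parquet - try forcing INT96 timestamp" commit: it keeps serialized timestamps in the legacy INT96 layout that the pinned minimum Spark version reads, rather than the INT64 layout newer pyarrow writes by default. A standalone sketch of the flag, with an illustrative file name and table:

```python
# Standalone sketch of the INT96 workaround; the file name and sample
# table are illustrative, not taken from the serializer.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(
    pd.DataFrame({"ts": pd.to_datetime(["2024-02-02 17:16:56"])}),
)
# Without the flag, pyarrow writes INT64 timestamps, which the older
# Spark reader in the minimum-dependency job may reject.
pq.write_table(table, "example.parquet", use_deprecated_int96_timestamps=True)
```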
diff --git a/woodwork/statistics_utils/_get_histogram_values.py b/woodwork/statistics_utils/_get_histogram_values.py
index c41905b0b..98c6dc845 100644
--- a/woodwork/statistics_utils/_get_histogram_values.py
+++ b/woodwork/statistics_utils/_get_histogram_values.py
@@ -12,7 +12,16 @@
         histogram (list(dict)): a list of dictionary with keys `bins` and `frequency`
     """
-    values = pd.cut(series, bins=bins, duplicates="drop").value_counts().sort_index()
+
+    if pd.api.types.is_numeric_dtype(series.dtype) or pd.api.types.is_bool_dtype(
+        series.dtype,
+    ):
+        series = series.astype(float)
+    values = (
+        pd.cut(x=series.to_numpy(), bins=bins, duplicates="drop")
+        .value_counts()
+        .sort_index()
+    )
     df = values.reset_index()
     df.columns = ["bins", "frequency"]
     results = []
diff --git a/woodwork/tests/accessor/test_column_accessor.py b/woodwork/tests/accessor/test_column_accessor.py
index 66c4667e1..3f9757c0f 100644
--- a/woodwork/tests/accessor/test_column_accessor.py
+++ b/woodwork/tests/accessor/test_column_accessor.py
@@ -101,7 +101,7 @@ def test_accessor_init_with_schema_errors(sample_series):
    new_dtype = "
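On the `_get_histogram_values` hunk above: the re-added cast narrows numeric and boolean series to `float` and bins the raw numpy values, presumably so that nullable extension dtypes survive `pd.cut` under the upgraded pandas and numpy. A hedged sketch of the resulting binning behavior, with an illustrative series:

```python
# Hedged sketch of the binning path after the hunk above; the sample
# series (a nullable Int64) is illustrative.
import pandas as pd

series = pd.Series([1, 2, 2, 3, None], dtype="Int64")

# Mirror the hunk: cast numeric/boolean series to float, then bin the
# raw numpy values rather than the extension array.
if pd.api.types.is_numeric_dtype(series.dtype) or pd.api.types.is_bool_dtype(
    series.dtype,
):
    series = series.astype(float)

values = (
    pd.cut(x=series.to_numpy(), bins=3, duplicates="drop")
    .value_counts()
    .sort_index()
)
print(values)  # frequency per bin interval; the missing value is dropped
```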