forked from pydata/xarray

Merge branch 'main' into groupby-shuffle
* main: (29 commits)
  Release notes for v2024.09.0 (pydata#9480)
  Fix `DataTree.coords.__setitem__` by adding `DataTreeCoordinates` class (pydata#9451)
  Rename DataTree's "ds" and "data" to "dataset" (pydata#9476)
  Update DataTree repr to indicate inheritance (pydata#9470)
  Bump pypa/gh-action-pypi-publish in the actions group (pydata#9460)
  Repo checker (pydata#9450)
  Add days_in_year and decimal_year to dt accessor (pydata#9105)
  remove parent argument from DataTree.__init__ (pydata#9465)
  Fix inheritance in DataTree.copy() (pydata#9457)
  Implement `DataTree.__delitem__` (pydata#9453)
  Add ASV for datatree.from_dict (pydata#9459)
  Make the first argument in DataTree.from_dict positional only (pydata#9446)
  Fix typos across the code, doc and comments (pydata#9443)
  DataTree should not be "Generic" (pydata#9445)
  Disallow passing a DataArray as data into the DataTree constructor (pydata#9444)
  Support additional dtypes in `resample` (pydata#9413)
  Shallow copy parent and children in DataTree constructor (pydata#9297)
  Bump minimum versions for dependencies (pydata#9434)
  Always include at least one category in random test data (pydata#9436)
  Avoid deep-copy when constructing groupby codes (pydata#9429)
  ...
dcherian committed Sep 12, 2024
2 parents 2d48690 + ed0418b commit 0679d2b
Showing 116 changed files with 2,240 additions and 1,061 deletions.
32 changes: 14 additions & 18 deletions .github/workflows/ci-additional.yaml
@@ -123,11 +123,11 @@ jobs:
python xarray/util/print_versions.py
- name: Install mypy
run: |
python -m pip install "mypy<1.9" --force-reinstall
python -m pip install "mypy" --force-reinstall
- name: Run mypy
run: |
-          python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/
+          python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report
- name: Upload mypy coverage to Codecov
uses: codecov/[email protected]
@@ -138,7 +138,7 @@ jobs:
name: codecov-umbrella
fail_ci_if_error: false

-  mypy39:
+  mypy-min:
name: Mypy 3.10
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
@@ -177,32 +177,30 @@ jobs:
python xarray/util/print_versions.py
- name: Install mypy
run: |
python -m pip install "mypy<1.9" --force-reinstall
python -m pip install "mypy" --force-reinstall
- name: Run mypy
run: |
-          python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report xarray/
+          python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report
- name: Upload mypy coverage to Codecov
uses: codecov/[email protected]
with:
file: mypy_report/cobertura.xml
-          flags: mypy39
+          flags: mypy-min
env_vars: PYTHON_VERSION
name: codecov-umbrella
fail_ci_if_error: false

-
-
pyright:
name: Pyright
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
if: |
-      always()
-      && (
-        contains( github.event.pull_request.labels.*.name, 'run-pyright')
-      )
+      always()
+      && (
+        contains( github.event.pull_request.labels.*.name, 'run-pyright')
+      )
defaults:
run:
shell: bash -l {0}
@@ -258,10 +256,10 @@ jobs:
runs-on: "ubuntu-latest"
needs: detect-ci-trigger
if: |
-      always()
-      && (
-        contains( github.event.pull_request.labels.*.name, 'run-pyright')
-      )
+      always()
+      && (
+        contains( github.event.pull_request.labels.*.name, 'run-pyright')
+      )
defaults:
run:
shell: bash -l {0}
@@ -312,8 +310,6 @@ jobs:
name: codecov-umbrella
fail_ci_if_error: false

-
-
min-version-policy:
name: Minimum Version Policy
runs-on: "ubuntu-latest"
4 changes: 2 additions & 2 deletions .github/workflows/pypi-release.yaml
@@ -88,7 +88,7 @@ jobs:
path: dist
- name: Publish package to TestPyPI
if: github.event_name == 'push'
-        uses: pypa/gh-action-pypi-publish@v1.9.0
+        uses: pypa/gh-action-pypi-publish@v1.10.1
with:
repository_url: https://test.pypi.org/legacy/
verbose: true
@@ -111,6 +111,6 @@ jobs:
name: releases
path: dist
- name: Publish package to PyPI
-        uses: pypa/gh-action-pypi-publish@v1.9.0
+        uses: pypa/gh-action-pypi-publish@v1.10.1
with:
verbose: true
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,7 @@
# https://pre-commit.com/
ci:
autoupdate_schedule: monthly
+  autoupdate_commit_msg: 'Update pre-commit hooks'
exclude: 'xarray/datatree_.*'
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
@@ -13,7 +14,7 @@ repos:
- id: mixed-line-ending
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
-    rev: 'v0.6.2'
+    rev: 'v0.6.3'
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/dataset_io.py
@@ -724,7 +724,7 @@ class PerformanceBackend(xr.backends.BackendEntrypoint):
def open_dataset(
self,
filename_or_obj: str | os.PathLike | None,
-        drop_variables: tuple[str] = None,
+        drop_variables: tuple[str, ...] = None,
*,
mask_and_scale=True,
decode_times=True,
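The annotation fix above matters because `tuple[str]` types a 1-tuple containing exactly one string, while `tuple[str, ...]` types a homogeneous tuple of any length. A quick illustrative sketch (the stub name is hypothetical, not part of the diff):

```python
def open_dataset_stub(drop_variables: tuple[str, ...] | None = None) -> None:
    """Stand-in for the benchmark's open_dataset signature above."""
    ...

open_dataset_stub(drop_variables=("a",))      # a 1-tuple satisfies both annotations
open_dataset_stub(drop_variables=("a", "b"))  # only tuple[str, ...] allows length > 1
```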
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/datatree.py
@@ -0,0 +1,15 @@
+import xarray as xr
+from xarray.core.datatree import DataTree
+
+
+class Datatree:
+    def setup(self):
+        run1 = DataTree.from_dict({"run1": xr.Dataset({"a": 1})})
+        self.d_few = {"run1": run1}
+        self.d_many = {f"run{i}": xr.Dataset({"a": 1}) for i in range(100)}
+
+    def time_from_dict_few(self):
+        DataTree.from_dict(self.d_few)
+
+    def time_from_dict_many(self):
+        DataTree.from_dict(self.d_many)
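For context on what this new benchmark exercises: `DataTree.from_dict` builds a tree from a mapping of paths to datasets or nodes, and per pydata#9446 in the commit list above the mapping must now be passed positionally. A minimal usage sketch (illustrative, not part of the diff):

```python
import xarray as xr
from xarray.core.datatree import DataTree

# Keys are tree paths, values are Dataset (or DataTree) objects.
tree = DataTree.from_dict({"/run1": xr.Dataset({"a": 1}), "/run2": xr.Dataset({"a": 2})})
print(tree["run1"])  # child node holding the first dataset
```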
19 changes: 17 additions & 2 deletions asv_bench/benchmarks/groupby.py
@@ -1,4 +1,5 @@
# import flox to avoid the cost of first import
+import cftime
import flox.xarray # noqa
import numpy as np
import pandas as pd
@@ -96,7 +97,7 @@ def setup(self, *args, **kwargs):

requires_dask()
super().setup(**kwargs)
self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()
self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dask_dataframe()
self.ds1d_mean = self.ds1d.groupby("b").mean().compute()

def time_binary_op_2d(self):
@@ -169,7 +170,21 @@ class GroupByLongTime:
def setup(self, use_cftime, use_flox):
arr = np.random.randn(10, 10, 365 * 30)
time = xr.date_range("2000", periods=30 * 365, use_cftime=use_cftime)
-        self.da = xr.DataArray(arr, dims=("y", "x", "time"), coords={"time": time})
+
+        # GH9426 - deep-copying CFTime object arrays is weirdly slow
+        asda = xr.DataArray(time)
+        labeled_time = []
+        for year, month in zip(asda.dt.year, asda.dt.month, strict=True):
+            labeled_time.append(cftime.datetime(year, month, 1))
+
+        self.da = xr.DataArray(
+            arr,
+            dims=("y", "x", "time"),
+            coords={"time": time, "time2": ("time", labeled_time)},
+        )
+
+    def time_setup(self, use_cftime, use_flox):
+        self.da.groupby("time.month")

def time_mean(self, use_cftime, use_flox):
with xr.set_options(use_flox=use_flox):
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/rolling.py
@@ -64,7 +64,7 @@ def time_rolling_long(self, func, pandas, use_bottleneck):
def time_rolling_np(self, window_, min_periods, use_bottleneck):
with xr.set_options(use_bottleneck=use_bottleneck):
self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce(
getattr(np, "nansum")
np.nansum
).load()

@parameterized(
4 changes: 2 additions & 2 deletions ci/requirements/bare-minimum.yml
@@ -11,6 +11,6 @@ dependencies:
- pytest-env
- pytest-xdist
- pytest-timeout
-  - numpy=1.23
+  - numpy=1.24
- packaging=23.1
-  - pandas=2.0
+  - pandas=2.1
24 changes: 12 additions & 12 deletions ci/requirements/min-all-deps.yml
@@ -9,37 +9,37 @@ dependencies:
# doc/user-guide/installing.rst, doc/user-guide/plotting.rst and setup.py.
- python=3.10
- array-api-strict=1.0 # dependency for testing the array api compat
-  - boto3=1.26
+  - boto3=1.28
- bottleneck=1.3
-  - cartopy=0.21
+  - cartopy=0.22
- cftime=1.6
- coveralls
-  - dask-core=2023.4
-  - distributed=2023.4
+  - dask-core=2023.9
+  - distributed=2023.9
# Flox > 0.8 has a bug with numbagg versions
# It will require numbagg > 0.6
# so we should just skip that series eventually
# or keep flox pinned for longer than necessary
- flox=0.7
-  - h5netcdf=1.1
+  - h5netcdf=1.2
# h5py and hdf5 tend to cause conflicts
# for e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
- h5py=3.8
- hdf5=1.12
- hypothesis
-  - iris=3.4
+  - iris=3.7
- lxml=4.9 # Optional dep of pydap
- matplotlib-base=3.7
- nc-time-axis=1.4
# netcdf follows a 1.major.minor[.patch] convention
# (see https://github.com/Unidata/netcdf4-python/issues/1090)
- netcdf4=1.6.0
-  - numba=0.56
+  - numba=0.57
- numbagg=0.2.1
-  - numpy=1.23
+  - numpy=1.24
- packaging=23.1
-  - pandas=2.0
+  - pandas=2.1
- pint=0.22
- pip
- pydap=3.4
@@ -49,9 +49,9 @@ dependencies:
- pytest-xdist
- pytest-timeout
- rasterio=1.3
-  - scipy=1.10
+  - scipy=1.11
- seaborn=0.12
- sparse=0.14
- toolz=0.12
-  - typing_extensions=4.5
-  - zarr=2.14
+  - typing_extensions=4.7
+  - zarr=2.16
2 changes: 1 addition & 1 deletion design_notes/flexible_indexes_notes.md
@@ -71,7 +71,7 @@ An `XarrayIndex` subclass must/should/may implement the following properties/methods:
- a `data` property to access index's data and map it to coordinate data (see [Section 4](#4-indexvariable))
- a `__getitem__()` implementation to propagate the index through DataArray/Dataset indexing operations
- `equals()`, `union()` and `intersection()` methods for data alignment (see [Section 2.6](#26-using-indexes-for-data-alignment))
-- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coodinates))
+- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coordinates))
- a method that may return a new index and that will be called when one of the corresponding coordinates is dropped from the Dataset/DataArray (multi-coordinate indexes)
- `encode()`/`decode()` methods that would allow storage-agnostic serialization and fast-path reconstruction of the underlying index object(s) (see [Section 2.8](#28-index-encoding))
- one or more "non-standard" methods or properties that could be leveraged in Xarray 3rd-party extensions like Dataset/DataArray accessors (see [Section 2.7](#27-using-indexes-for-other-purposes))
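To make the interface list above concrete, here is a hypothetical skeleton (method names are taken from the design note itself; this is a sketch of the proposed design, not a shipped xarray API):

```python
class MyCustomIndex:
    """Hypothetical sketch of an XarrayIndex subclass per the design note."""

    @property
    def data(self):
        """Access the index's data and map it to coordinate data (Section 4)."""
        ...

    def __getitem__(self, indexer):
        """Propagate the index through DataArray/Dataset indexing operations."""
        ...

    # Data-alignment hooks (Section 2.6)
    def equals(self, other): ...
    def union(self, other): ...
    def intersection(self, other): ...

    # Storage-agnostic serialization and fast-path reconstruction (Section 2.8)
    def encode(self): ...

    @classmethod
    def decode(cls, encoded): ...
```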
2 changes: 1 addition & 1 deletion design_notes/grouper_objects.md
@@ -166,7 +166,7 @@ where `|` represents chunk boundaries. A simple rechunking to
```
000|111122|3333
```
-would make this resampling reduction an embarassingly parallel blockwise problem.
+would make this resampling reduction an embarrassingly parallel blockwise problem.

Similarly consider monthly-mean climatologies for which the month numbers might be
```
2 changes: 1 addition & 1 deletion design_notes/named_array_design_doc.md
@@ -258,7 +258,7 @@ Questions:
Variable.coarsen_reshape
Variable.rolling_window

-    Variable.set_dims  # split this into broadcas_to and expand_dims
+    Variable.set_dims  # split this into broadcast_to and expand_dims


# Reordering/Reshaping
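The comment's proposed split corresponds to two behaviors already visible in the public API; a brief, hedged illustration of the difference:

```python
import xarray as xr

da = xr.DataArray([1, 2, 3], dims="x")

# expand_dims only inserts a new dimension of size 1 ...
expanded = da.expand_dims("y")  # dims ("y", "x"), shape (1, 3)

# ... whereas broadcasting replicates values along the new dimension.
other = xr.DataArray([0, 0], dims="y")
broadcast, _ = xr.broadcast(da, other)  # dims ("x", "y"), shape (3, 2)
```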
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -530,9 +530,11 @@ Datetimelike properties
DataArray.dt.quarter
DataArray.dt.days_in_month
DataArray.dt.daysinmonth
+   DataArray.dt.days_in_year
DataArray.dt.season
DataArray.dt.time
DataArray.dt.date
+   DataArray.dt.decimal_year
DataArray.dt.calendar
DataArray.dt.is_month_start
DataArray.dt.is_month_end
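These two entries document the accessors added in pydata#9105 (see the commit list above); a quick usage sketch, assuming a standard calendar:

```python
import pandas as pd
import xarray as xr

time = xr.DataArray(pd.date_range("2000-01-01", periods=3, freq="200D"), dims="time")
print(time.dt.days_in_year)   # 366 for dates falling in leap year 2000, 365 otherwise
print(time.dt.decimal_year)   # the year plus the elapsed fraction of it, e.g. 2000.0
```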
4 changes: 2 additions & 2 deletions doc/user-guide/dask.rst
@@ -298,7 +298,7 @@ Automatic parallelization with ``apply_ufunc`` and ``map_blocks``

.. tip::

-    Some problems can become embarassingly parallel and thus easy to parallelize
+    Some problems can become embarrassingly parallel and thus easy to parallelize
automatically by rechunking to a frequency, e.g. ``ds.chunk(time=TimeResampler("YE"))``.
See :py:meth:`Dataset.chunk` for more.

@@ -559,7 +559,7 @@ larger chunksizes.

.. tip::

-    Many time domain problems become amenable to an embarassingly parallel or blockwise solution
+    Many time domain problems become amenable to an embarrassingly parallel or blockwise solution
(e.g. using :py:func:`xarray.map_blocks`, :py:func:`dask.array.map_blocks`, or
:py:func:`dask.array.blockwise`) by rechunking to a frequency along the time dimension.
Provide :py:class:`xarray.groupers.TimeResampler` objects to :py:meth:`Dataset.chunk` to do so.
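A minimal, self-contained sketch of the rechunk-to-frequency pattern these tips describe (assumes dask is installed; `TimeResampler` has been public in `xarray.groupers` since v2024.07):

```python
import numpy as np
import pandas as pd
import xarray as xr
from xarray.groupers import TimeResampler

time = pd.date_range("2000-01-01", periods=3 * 365, freq="D")
ds = xr.Dataset({"a": ("time", np.random.randn(time.size))}, coords={"time": time})

# Align chunk boundaries with the yearly resampling groups ("YE" = year-end);
# the reduction below then never crosses chunks, i.e. it is embarrassingly parallel.
ds = ds.chunk(time=TimeResampler("YE"))
result = ds.resample(time="YE").mean().compute()
```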
2 changes: 1 addition & 1 deletion doc/user-guide/data-structures.rst
@@ -289,7 +289,7 @@ pressure that were made under various conditions:
* the measurements were made on four different days;
* they were made at two separate locations, which we will represent using
their latitude and longitude; and
-* they were made using instruments by three different manufacutrers, which we
+* they were made using instruments by three different manufacturers, which we
will refer to as `'manufac1'`, `'manufac2'`, and `'manufac3'`.

.. ipython:: python
6 changes: 6 additions & 0 deletions doc/user-guide/groupby.rst
@@ -305,6 +305,12 @@ Use grouper objects to group by multiple dimensions:
from xarray.groupers import UniqueGrouper
da.groupby(["lat", "lon"]).sum()
+The above is sugar for using ``UniqueGrouper`` objects directly:
+
+.. ipython:: python
+
+    da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
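A self-contained version of the two equivalent spellings above (the coordinate values are illustrative):

```python
import numpy as np
import xarray as xr
from xarray.groupers import UniqueGrouper

da = xr.DataArray(
    np.arange(12.0).reshape(4, 3),
    dims=("x", "y"),
    coords={"lat": ("x", [10, 10, 20, 20]), "lon": ("y", [0, 1, 1])},
)

# The list form is sugar for explicit UniqueGrouper objects.
assert da.groupby(["lat", "lon"]).sum().equals(
    da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
)
```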
2 changes: 1 addition & 1 deletion doc/user-guide/pandas.rst
@@ -120,7 +120,7 @@ Particularly after a roundtrip, the following deviations are noted:

- a non-dimension Dataset ``coordinate`` is converted into ``variable``
- a non-dimension DataArray ``coordinate`` is not converted
-- ``dtype`` is not allways the same (e.g. "str" is converted to "object")
+- ``dtype`` is not always the same (e.g. "str" is converted to "object")
- ``attrs`` metadata is not conserved

To avoid these problems, the third-party `ntv-pandas <https://github.com/loco-philippe/ntv-pandas>`__ library offers lossless and reversible conversions between
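The dtype caveat in the corrected line shows up in a simple roundtrip; a hedged sketch (exact dtypes can vary with the pandas version):

```python
import xarray as xr

ds = xr.Dataset(
    {"v": ("x", [1.0, 2.0])},
    coords={"x": [10, 20], "label": ("x", ["a", "b"])},  # non-dimension coordinate
)
roundtripped = xr.Dataset.from_dataframe(ds.to_dataframe())
# "label" returns as a data variable rather than a coordinate,
# and string data comes back with "object" dtype.
print(roundtripped)
```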
2 changes: 1 addition & 1 deletion doc/user-guide/testing.rst
@@ -193,7 +193,7 @@ different type:

.. ipython:: python
-    def sparse_random_arrays(shape: tuple[int]) -> sparse._coo.core.COO:
+    def sparse_random_arrays(shape: tuple[int, ...]) -> sparse._coo.core.COO:
"""Strategy which generates random sparse.COO arrays"""
if shape is None:
shape = npst.array_shapes()
