diff --git a/pyproject.toml b/pyproject.toml
index 3ea17821f..628356fb5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,7 @@ dependencies = [
   "packaging >=24.0",
   "pandas >=2.2",
   "pint >=0.18",
-  "platformdirs >=3.2",
+  "pooch >=1.8.0",
   "pyarrow >=15.0.0", # Strongly encouraged for pandas v2.2.0+
   "pyyaml >=6.0.1",
   "scikit-learn >=0.21.3",
@@ -79,8 +79,6 @@ dev = [
   "nbval >=0.11.0",
   "pandas-stubs >=2.2",
   "pip >=24.0",
-  "platformdirs >=3.2",
-  "pooch >=1.8.0",
   "pre-commit >=3.7",
   "pylint >=3.2.4",
   "pytest >=8.0.0",
diff --git a/tests/conftest.py b/tests/conftest.py
index 95c36f4eb..e98fb784f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,10 +13,10 @@
 from xclim.core import indicator
 from xclim.core.calendar import max_doy
 from xclim.testing import helpers
+from xclim.testing.helpers import default_cache_dir  # noqa
 from xclim.testing.helpers import nimbus as _nimbus
+from xclim.testing.helpers import open_dataset as _open_dataset
 from xclim.testing.helpers import test_timeseries
-from xclim.testing.utils import default_cache_dir  # noqa
-from xclim.testing.utils import open_dataset as _open_dataset
 
 
 @pytest.fixture
@@ -26,21 +26,7 @@ def random() -> np.random.Generator:
 
 @pytest.fixture
 def tmp_netcdf_filename(tmpdir):
-    yield Path(tmpdir).joinpath("testfile.nc")
-
-
-@pytest.fixture(autouse=True, scope="session")
-def threadsafe_data_dir(tmp_path_factory):
-    yield Path(tmp_path_factory.getbasetemp().joinpath("data"))
-
-
-@pytest.fixture(autouse=True, scope="session")
-def nimbus(threadsafe_data_dir):
-    yield _nimbus(
-        data_dir=threadsafe_data_dir,
-        repo=helpers.TESTDATA_REPO_URL,
-        branch=helpers.TESTDATA_BRANCH,
-    )
+    return Path(tmpdir).joinpath("testfile.nc")
 
 
 @pytest.fixture
@@ -57,6 +43,11 @@ def _lat_series(values):
     return _lat_series
 
 
+@pytest.fixture
+def timeseries():
+    return test_timeseries
+
+
 @pytest.fixture
 def tas_series():
     """Return mean temperature time series."""
@@ -309,40 +300,30 @@ def rlus_series():
 
 
 @pytest.fixture(scope="session")
-def cmip3_day_tas(threadsafe_data_dir):
-    # xr.set_options(enable_cftimeindex=False)
-    ds = _open_dataset(
-        "cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc",
-        cache_dir=threadsafe_data_dir,
-        branch=helpers.TESTDATA_BRANCH,
-        engine="h5netcdf",
-    )
-    yield ds.tas
-    ds.close()
+def threadsafe_data_dir(tmp_path_factory):
+    return Path(tmp_path_factory.getbasetemp().joinpath("data"))
 
 
 @pytest.fixture(scope="session")
-def get_file(nimbus):
-    def _get_session_scoped_file(file: str):
-        nimbus.fetch(file)
-
-    return _get_session_scoped_file
+def nimbus(threadsafe_data_dir):
+    return _nimbus(
+        data_dir=threadsafe_data_dir,
+        repo=helpers.TESTDATA_REPO_URL,
+        branch=helpers.TESTDATA_BRANCH,
+    )
 
 
 @pytest.fixture(scope="session")
-def open_dataset(threadsafe_data_dir):
-    def _open_session_scoped_file(
-        file: str | os.PathLike, branch: str = helpers.TESTDATA_BRANCH, **xr_kwargs
-    ):
+def open_dataset(nimbus):
+    def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs):
+        xr_kwargs.setdefault("cache", True)
         xr_kwargs.setdefault("engine", "h5netcdf")
-        return _open_dataset(
-            file, cache_dir=threadsafe_data_dir, branch=branch, **xr_kwargs
-        )
+        return _open_dataset(file, cache_dir=nimbus.path, **xr_kwargs)
 
     return _open_session_scoped_file
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def official_indicators():
     # Remove unofficial indicators (as those created during the tests, and those from YAML-built modules)
     registry_cp = indicator.registry.copy()
@@ -352,17 +333,39 @@ def official_indicators():
     return registry_cp
 
 
-@pytest.fixture(scope="function")
-def atmosds(threadsafe_data_dir) -> xr.Dataset:
+@pytest.fixture
+def lafferty_sriver_ds(nimbus) -> xr.Dataset:
+    """Get data from the Lafferty & Sriver unit test.
+
+    Notes
+    -----
+    https://github.com/david0811/lafferty-sriver_2023_npjCliAtm/tree/main/unit_test
+    """
+    fn = nimbus.fetch(
+        "uncertainty_partitioning/seattle_avg_tas.csv",
+    )
+
+    df = pd.read_csv(fn, parse_dates=["time"]).rename(
+        columns={"ssp": "scenario", "ensemble": "downscaling"}
+    )
+
+    # Make xarray dataset
+    return xr.Dataset.from_dataframe(
+        df.set_index(["scenario", "model", "downscaling", "time"])
+    )
+
+
+@pytest.fixture
+def atmosds(nimbus) -> xr.Dataset:
+    """Get synthetic atmospheric dataset."""
     return _open_dataset(
-        threadsafe_data_dir.joinpath("atmosds.nc"),
-        cache_dir=threadsafe_data_dir,
-        branch=helpers.TESTDATA_BRANCH,
+        "atmosds.nc",
+        cache_dir=nimbus.path,
         engine="h5netcdf",
     ).load()
 
 
-@pytest.fixture(scope="function")
+@pytest.fixture(scope="session")
 def ensemble_dataset_objects() -> dict[str, str]:
     edo = dict()
     edo["nc_files_simple"] = [
@@ -378,8 +381,8 @@ def ensemble_dataset_objects() -> dict[str, str]:
     return edo
 
 
-@pytest.fixture(scope="session", autouse=True)
-def gather_session_data(threadsafe_data_dir, worker_id):
+@pytest.fixture(autouse=True, scope="session")
+def gather_session_data(request, nimbus, worker_id):
     """Gather testing data on pytest run.
 
     When running pytest with multiple workers, one worker will copy data remotely to _default_cache_dir while
@@ -389,25 +392,13 @@
     Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset.
     """
     helpers.testing_setup_warnings()
-    helpers.gather_testing_data(threadsafe_data_dir, worker_id)
-    helpers.generate_atmos(threadsafe_data_dir)
-
-
-@pytest.fixture(scope="session", autouse=True)
-def cleanup(request):
-    """Cleanup a testing file once we are finished.
-
-    This flag prevents remote data from being downloaded multiple times in the same pytest run.
- """ + helpers.gather_testing_data(nimbus.path, worker_id) + helpers.generate_atmos(nimbus.path) def remove_data_written_flag(): + """Cleanup cache folder once we are finished.""" flag = default_cache_dir.joinpath(".data_written") if flag.exists(): flag.unlink() request.addfinalizer(remove_data_written_flag) - - -@pytest.fixture -def timeseries(): - return test_timeseries diff --git a/tests/test_analog.py b/tests/test_analog.py index 72857b007..2608df226 100644 --- a/tests/test_analog.py +++ b/tests/test_analog.py @@ -58,8 +58,8 @@ def test_exact_randn(exact_randn): @pytest.mark.slow @pytest.mark.parametrize("method", xca.metrics.keys()) def test_spatial_analogs(method, open_dataset): - diss = open_dataset("SpatialAnalogs/dissimilarity") - data = open_dataset("SpatialAnalogs/indicators") + diss = open_dataset("SpatialAnalogs/dissimilarity.nc") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) @@ -75,7 +75,7 @@ def test_spatial_analogs(method, open_dataset): def test_unsupported_spatial_analog_method(open_dataset): method = "KonMari" - data = open_dataset("SpatialAnalogs/indicators") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) @@ -87,8 +87,8 @@ def test_unsupported_spatial_analog_method(open_dataset): def test_spatial_analogs_multi_index(open_dataset): # Test multi-indexes - diss = open_dataset("SpatialAnalogs/dissimilarity") - data = open_dataset("SpatialAnalogs/indicators") + diss = open_dataset("SpatialAnalogs/dissimilarity.nc") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) diff --git a/tests/test_atmos.py b/tests/test_atmos.py index 10d5d0efe..23929550d 100644 --- a/tests/test_atmos.py +++ b/tests/test_atmos.py @@ -94,7 +94,7 @@ def test_humidex(tas_series): def test_heat_index(atmosds): - # Keep just Montreal values for summertime as we need tas > 20 degC + # Keep just Montreal values for summer as we need tas > 20 degC tas = atmosds.tasmax[1][150:170] hurs = atmosds.hurs[1][150:170] diff --git a/tests/test_indices.py b/tests/test_indices.py index a9386087f..69142a077 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -2562,12 +2562,14 @@ def test_simple(self, open_dataset, ind, exp): out = ind(ds.tas.sel(location="Victoria")) np.testing.assert_almost_equal(out[0], exp, decimal=4) - def test_indice_against_icclim(self, cmip3_day_tas): + def test_indice_against_icclim(self, open_dataset): from xclim.indicators import icclim # noqa + cmip3_tas = open_dataset("cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc").tas + with set_options(cf_compliance="log"): - ind = xci.tg_mean(cmip3_day_tas) - icclim = icclim.TG(cmip3_day_tas) + ind = xci.tg_mean(cmip3_tas) + icclim = icclim.TG(cmip3_tas) np.testing.assert_array_equal(icclim, ind) diff --git a/tests/test_partitioning.py b/tests/test_partitioning.py index 54e27d823..f34691985 100644 --- a/tests/test_partitioning.py +++ b/tests/test_partitioning.py @@ -1,7 +1,6 @@ from __future__ import annotations import numpy as np -import pandas as pd import xarray as xr from xclim.ensembles import fractional_uncertainty, hawkins_sutton, lafferty_sriver @@ -108,19 +107,8 @@ def test_lafferty_sriver_synthetic(random): lafferty_sriver(da, sm=sm) -def 
-    seattle = get_file("uncertainty_partitioning/seattle_avg_tas.csv")
-
-    df = pd.read_csv(seattle, parse_dates=["time"]).rename(
-        columns={"ssp": "scenario", "ensemble": "downscaling"}
-    )
-
-    # Make xarray dataset
-    ds = xr.Dataset.from_dataframe(
-        df.set_index(["scenario", "model", "downscaling", "time"])
-    )
-
-    _g, u = lafferty_sriver(ds.tas)
+def test_lafferty_sriver(lafferty_sriver_ds):
+    _g, u = lafferty_sriver(lafferty_sriver_ds.tas)
 
     fu = fractional_uncertainty(u)
diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py
index 63e0881a0..35646f12e 100644
--- a/tests/test_testing_utils.py
+++ b/tests/test_testing_utils.py
@@ -3,14 +3,14 @@
 import platform
 import sys
 from pathlib import Path
-from urllib.error import URLError
 
 import numpy as np
 import pytest
 from xarray import Dataset
 
-import xclim.testing.utils as utilities
 from xclim import __version__ as __xclim_version__
+from xclim.testing import helpers
+from xclim.testing import utils as utilities
 from xclim.testing.helpers import test_timeseries as timeseries
 
 
@@ -39,52 +39,9 @@ def file_md5_checksum(f_name):
             hash_md5.update(f.read())
         return hash_md5.hexdigest()
 
-    @pytest.mark.requires_internet
-    def test_get_failure(self, tmp_path):
-        bad_repo_address = "https://github.com/beard/of/zeus/"
-        with pytest.raises(FileNotFoundError):
-            utilities._get(
-                Path("san_diego", "60_percent_of_the_time_it_works_everytime"),
-                bad_repo_address,
-                "main",
-                tmp_path,
-            )
-
-    @pytest.mark.requires_internet
-    def test_open_dataset_with_bad_file(self, tmp_path):
-        cmip3_folder = tmp_path.joinpath("main", "cmip3")
-        cmip3_folder.mkdir(parents=True)
-
-        cmip3_file = "tas.sresb1.giss_model_e_r.run1.atm.da.nc"
-        Path(cmip3_folder, cmip3_file).write_text("This file definitely isn't right.")
-
-        cmip3_md5 = f"{cmip3_file}.md5"
-        bad_cmip3_md5 = "bc51206e6462fc8ed08fd4926181274c"
-        Path(cmip3_folder, cmip3_md5).write_text(bad_cmip3_md5)
-
-        # Check for raised warning for local file md5 sum and remote md5 sum
-        with pytest.warns(UserWarning):
-            new_cmip3_file = utilities._get(
-                Path("cmip3", cmip3_file),
-                github_url="https://github.com/Ouranosinc/xclim-testdata",
-                branch="main",
-                cache_dir=tmp_path,
-            )
-
-        # Ensure that the new cmip3 file is in the cache directory
-        assert (
-            self.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) != bad_cmip3_md5
-        )
-
-        # Ensure that the md5 file was updated at the same time
-        assert (
-            self.file_md5_checksum(Path(cmip3_folder, new_cmip3_file))
-            == Path(cmip3_folder, cmip3_md5).read_text()
-        )
-
     @pytest.mark.requires_internet
     def test_open_testdata(self):
-        ds = utilities.open_dataset(
+        ds = helpers.open_dataset(
             Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712"), engine="h5netcdf"
         )
         assert ds.lon.size == 128
@@ -126,22 +83,3 @@ def test_release_notes_file_not_implemented(self, tmp_path):
         temp_filename = tmp_path.joinpath("version_info.txt")
         with pytest.raises(NotImplementedError):
             utilities.publish_release_notes(style="qq", file=temp_filename)
-
-
-class TestTestingFileAccessors:
-    def test_unsafe_urls(self):
-        with pytest.raises(
-            ValueError, match="GitHub URL not secure: 'ftp://domain.does.not.exist/'."
-        ):
-            utilities.open_dataset(
-                "doesnt_exist.nc", github_url="ftp://domain.does.not.exist/"
-            )
-
-    def test_malicious_urls(self):
-        with pytest.raises(
-            URLError,
-            match="urlopen error OPeNDAP URL is not well-formed: 'doesnt_exist.nc'",
-        ):
-            utilities.open_dataset(
-                "doesnt_exist.nc", dap_url="Robert'); DROP TABLE STUDENTS; --"
-            )
diff --git a/xclim/cli.py b/xclim/cli.py
index 67a6da1eb..1df887b0d 100644
--- a/xclim/cli.py
+++ b/xclim/cli.py
@@ -11,13 +11,17 @@
 
 import click
 import xarray as xr
-from dask.diagnostics import ProgressBar
+from dask.diagnostics.progress import ProgressBar
 
 import xclim as xc
 from xclim.core.dataflags import DataQualityException, data_flags, ecad_compliant
 from xclim.core.utils import InputKind
-from xclim.testing.helpers import TESTDATA_BRANCH, populate_testing_data
-from xclim.testing.utils import _default_cache_dir, publish_release_notes, show_versions
+from xclim.testing.helpers import (
+    TESTDATA_BRANCH,
+    default_cache_dir,
+    populate_testing_data,
+)
+from xclim.testing.utils import publish_release_notes, show_versions
 
 distributed = False
 try:
@@ -169,7 +173,7 @@ def prefetch_testing_data(ctx, branch):
             f"Gathering testing data from xclim-testdata `{testdata_branch}` branch..."
         )
         click.echo(populate_testing_data(branch=testdata_branch))
-        click.echo(f"Testing data saved to `{_default_cache_dir}`.")
+        click.echo(f"Testing data saved to `{default_cache_dir}`.")
     ctx.exit()
diff --git a/xclim/testing/conftest.py b/xclim/testing/conftest.py
index 12af10934..7e175e975 100644
--- a/xclim/testing/conftest.py
+++ b/xclim/testing/conftest.py
@@ -11,12 +11,11 @@
 import pytest
 
 from xclim.testing import helpers
-from xclim.testing.utils import _default_cache_dir  # noqa
-from xclim.testing.utils import open_dataset as _open_dataset
+from xclim.testing.helpers import open_dataset as _open_dataset
 
 
 @pytest.fixture(autouse=True, scope="session")
-def threadsafe_data_dir(tmp_path_factory) -> Path:
+def threadsafe_data_dir(tmp_path_factory):
     """Return a threadsafe temporary directory for storing testing data."""
     yield Path(tmp_path_factory.getbasetemp().joinpath("data"))
diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py
index 8b1b687a1..81d2ce31a 100644
--- a/xclim/testing/helpers.py
+++ b/xclim/testing/helpers.py
@@ -6,15 +6,14 @@
 import logging
 import os
 import re
-import shutil
-import tempfile
 import time
 import warnings
 from datetime import datetime as dt
 from pathlib import Path
 from shutil import copytree
 from sys import platform
-from urllib.error import HTTPError
+from urllib.error import HTTPError, URLError
+from urllib.parse import urljoin, urlparse
 
 import numpy as np
 import pandas as pd
@@ -23,6 +22,8 @@
 from dask.callbacks import Callback
 from filelock import FileLock
 from packaging.version import Version
+from xarray import Dataset
+from xarray import open_dataset as _open_dataset
 
 try:
     from pytest_socket import SocketBlockedError
@@ -37,11 +38,14 @@
     longwave_upwelling_radiation_from_net_downwelling,
     shortwave_upwelling_radiation_from_net_downwelling,
 )
-from xclim.testing.utils import default_cache_dir
-from xclim.testing.utils import open_dataset as _open_dataset
+
+logger = logging.getLogger("xclim")
 
 default_testdata_version = "v2023.12.14"
+"""Default version of the testing data to use when fetching datasets."""
+default_cache_dir = Path(pooch.os_cache("xclim-testdata"))
+"""Default location for the testing data cache."""
 
 TESTDATA_REPO_URL = str(
     os.getenv("XCLIM_TESTDATA_REPO_URL", "https://github.com/Ouranosinc/xclim-testdata")
 )
@@ -122,33 +126,17 @@
 $ env XCLIM_DATA_DIR="/path/to/my/data" pytest
 """
 
-DATA_UPDATES = bool(os.getenv("XCLIM_DATA_UPDATES"))
-"""Sets whether to allow updates to the testing datasets.
-
-If set to ``True``, the data files will be downloaded even if the upstream hashes do not match.
-
-Notes
------
-When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable:
-
-.. code-block:: console
-
-    $ export XCLIM_DATA_UPDATES=True
-
-or setting the variable at runtime:
-
-.. code-block:: console
-
-    $ env XCLIM_DATA_UPDATES=True pytest
-"""
 
 __all__ = [
-    "DATA_UPDATES",
     "PREFETCH_TESTING_DATA",
     "TESTDATA_BRANCH",
     "add_example_file_paths",
     "assert_lazy",
+    "default_cache_dir",
     "generate_atmos",
+    "nimbus",
+    "open_dataset",
+    "populate_testing_data",
     "test_timeseries",
 ]
@@ -181,100 +169,48 @@ def testing_setup_warnings():
     )
 
 
-def load_registry(
-    file: str | Path | None = None,
-    repo: str = TESTDATA_REPO_URL,
-    branch: str = TESTDATA_BRANCH,
-) -> dict[str, str]:
+def load_registry() -> dict[str, str]:
     """Load the registry file for the test data.
 
-    Parameters
-    ----------
-    file : str or Path, optional
-        Path to the registry file. If not provided, the registry file found within the package data will be used.
-
     Returns
     -------
     dict
         Dictionary of filenames and hashes.
     """
-    remote = f"{repo}/raw/{branch}/data"
-
-    # Get registry file from package_data
-    if file is None:
-        registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))
-        if not registry_file.exists():
-            registry_file.touch()
-        url = f"{remote}/{registry_file.name}"
-        try:
-            with tempfile.TemporaryDirectory() as tempdir:
-                remote_registry_file = pooch.retrieve(
-                    url=url,
-                    known_hash=None,
-                    path=tempdir,
-                    fname="registry.txt",
-                )
-                # Check if the local registry file matches the remote registry
-                if pooch.file_hash(remote_registry_file) != pooch.file_hash(
-                    registry_file.as_posix()
-                ):
-                    warnings.warn(
-                        "Local registry file does not match remote registry file."
-                    )
-                    shutil.move(remote_registry_file, registry_file)
-        except FileNotFoundError:
-            warnings.warn(
-                "Registry file not accessible in remote repository. "
-                "Aborting file retrieval and using local registry file."
-            )
-        except SocketBlockedError:
-            warnings.warn(
-                "Testing suite is being run with `--disable-socket`. Using local registry file."
-            )
-        if not registry_file.exists():
-            raise FileNotFoundError(
-                f"Local registry file not found: {registry_file}. "
-                "Testing setup cannot proceed without registry file."
-            )
-    else:
-        registry_file = Path(file)
-        if not registry_file.exists():
-            raise FileNotFoundError(f"Registry file not found: {registry_file}")
-
-    logging.info("Registry file found: %s", registry_file)
+    registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))
+    if not registry_file.exists():
+        raise FileNotFoundError(f"Registry file not found: {registry_file}")
 
     # Load the registry file
-    registry = dict()
-    with registry_file.open() as buffer:
-        for entry in buffer.readlines():
-            registry[entry.split()[0]] = entry.split()[1]
-
+    with registry_file.open() as f:
+        registry = {line.split()[0]: line.split()[1] for line in f}
     return registry
 
 
 def nimbus(  # noqa: PR01
     data_dir: str | Path = CACHE_DIR,
-    data_updates: bool = DATA_UPDATES,
     repo: str = TESTDATA_REPO_URL,
     branch: str = TESTDATA_BRANCH,
+    data_updates: bool = True,
 ) -> pooch.Pooch:
-    """Pooch registry instance for xhydro test data.
+    """Pooch registry instance for xclim test data.
 
     Parameters
     ----------
     data_dir : str or Path
         Path to the directory where the data files are stored.
-    data_updates : bool
-        If True, allow updates to the data files.
     repo : str
         URL of the repository to use when fetching testing datasets.
     branch : str
         Branch of repository to use when fetching testing datasets.
+    data_updates : bool
+        If True, allow updates to the data files. Default is True.
 
     Returns
     -------
     pooch.Pooch
-        Pooch instance for the xhydro test data.
+        Pooch instance for the xclim test data.
 
     Notes
     -----
@@ -282,8 +218,6 @@ def nimbus(  # noqa: PR01
     - ``XCLIM_DATA_DIR``: If this environment variable is set, it will be used as the base directory to store the
       data files. The directory should be an absolute path (i.e., it should start with ``/``). Otherwise,
       the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`).
-    - ``XCLIM_DATA_UPDATES``: If this environment variable is set, then the data files will be downloaded even if the
-      upstream hashes do not match. This is useful if you want to always use the latest version of the data files.
    - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository
       to use when fetching datasets. Otherwise, the default repository will be used.
     - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository
@@ -302,22 +236,68 @@
     data = xr.open_dataset(example_file)
     """
     remote = f"{repo}/raw/{branch}/data"
-
     return pooch.create(
         path=data_dir,
         base_url=remote,
         version=default_testdata_version,
         version_dev=branch,
         allow_updates=data_updates,
-        registry=load_registry(repo=repo, branch=branch),
+        registry=load_registry(),
     )
 
 
+# Idea copied from raven, which borrowed it from xclim, which borrowed it from xarray, which borrowed it from Seaborn
+def open_dataset(
+    name: str | os.PathLike[str],
+    dap_url: str | None = None,
+    cache_dir: str | os.PathLike[str] = default_cache_dir,
+    **kwargs,
+) -> Dataset:
+    r"""Open a dataset from the online GitHub-like repository.
+
+    If a local copy is found then always use that to avoid network traffic.
+
+    Parameters
+    ----------
+    name : str or os.PathLike
+        Name of the file containing the dataset.
+    dap_url : str, optional
+        URL to OPeNDAP folder where the data is stored. If supplied, supersedes the local cache.
+    cache_dir : str or os.PathLike
+        The directory in which to search for and write cached data.
+    \*\*kwargs
+        For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`.
+
+    Returns
+    -------
+    Dataset
+
+    See Also
+    --------
+    xarray.open_dataset
+    """
+    if dap_url:
+        try:
+            return _open_dataset(
+                audit_url(urljoin(dap_url, str(name)), context="OPeNDAP"), **kwargs
+            )
+        except (OSError, URLError):
+            msg = f"OPeNDAP file not read. Verify that the service is available: '{urljoin(dap_url, str(name))}'"
+            logger.error(msg)
+            raise
+
+    local_file = Path(cache_dir).joinpath(name)
+    try:
+        ds = _open_dataset(local_file, **kwargs)
+        return ds
+    except OSError as err:
+        raise err
+
+
 def populate_testing_data(
-    registry_file: str | Path | None = None,
     temp_folder: Path | None = None,
-    repo: str | None = None,
-    branch: str | None = None,
+    repo: str = TESTDATA_REPO_URL,
+    branch: str = TESTDATA_BRANCH,
     local_cache: Path = default_cache_dir,
 ) -> None:
     """Populate the local cache with the testing data.
@@ -329,37 +309,21 @@
     repo : str, optional
         URL of the repository to use when fetching testing datasets.
     branch : str, optional
-        Branch of hydrologie/xhydro-testdata to use when fetching testing datasets.
+        Branch of Ouranosinc/xclim-testdata to use when fetching testing datasets.
     local_cache : Path
-        Path to the local cache. Defaults to the location set by the platformdirs library.
+        The path to the local cache. Defaults to the location set by the platformdirs library.
         The testing data will be downloaded to this local cache.
 
     Returns
     -------
     None
     """
-    if repo is None:
-        _repo = TESTDATA_REPO_URL
-    else:
-        _repo = repo
-    if branch is None:
-        _branch = TESTDATA_BRANCH
-    else:
-        _branch = branch
-    if temp_folder is not None:
-        _local_cache = temp_folder
-    else:
-        _local_cache = Path(local_cache)
-
     # Create the Pooch instance
-    n = nimbus(data_dir=_local_cache, repo=_repo, branch=_branch)
-
-    # Load the registry file
-    registry = load_registry(file=registry_file, repo=_repo, branch=_branch)
+    n = nimbus(data_dir=temp_folder or local_cache, repo=repo, branch=branch)
 
     # Download the files
     errored_files = []
-    for file in registry.keys():
+    for file in load_registry():
         try:
             n.fetch(file)
         except HTTPError:
@@ -375,30 +339,28 @@
             raise SocketBlockedError(msg) from e
     else:
         logging.info("Files were downloaded successfully.")
-    finally:
-        if errored_files:
-            logging.error(
-                "The following files were unable to be downloaded: %s",
-                errored_files,
-            )
+
+    if errored_files:
+        logging.error(
+            "The following files were unable to be downloaded: %s",
+            errored_files,
+        )
 
 
-def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]:
+def generate_atmos(cache_dir: str | os.PathLike[str] | Path) -> dict[str, xr.DataArray]:
     """Create the `atmosds` synthetic testing dataset."""
-    with _open_dataset(
+    with open_dataset(
         "ERA5/daily_surface_cancities_1990-1993.nc",
         cache_dir=cache_dir,
-        branch=TESTDATA_BRANCH,
         engine="h5netcdf",
     ) as ds:
+        rsus = shortwave_upwelling_radiation_from_net_downwelling(ds.rss, ds.rsds)
+        rlus = longwave_upwelling_radiation_from_net_downwelling(ds.rls, ds.rlds)
         tn10 = calendar.percentile_doy(ds.tasmin, per=10)
         t10 = calendar.percentile_doy(ds.tas, per=10)
         t90 = calendar.percentile_doy(ds.tas, per=90)
         tx90 = calendar.percentile_doy(ds.tasmax, per=90)
 
-        rsus = shortwave_upwelling_radiation_from_net_downwelling(ds.rss, ds.rsds)
-        rlus = longwave_upwelling_radiation_from_net_downwelling(ds.rls, ds.rlds)
-
         ds = ds.assign(
             rsus=rsus,
             rlus=rlus,
@@ -413,18 +375,19 @@ def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]:
         ds.to_netcdf(atmos_file, engine="h5netcdf")
 
     # Give access to dataset variables by name in namespace
-    namespace = dict()
-    with _open_dataset(
-        atmos_file, branch=TESTDATA_BRANCH, cache_dir=cache_dir, engine="h5netcdf"
-    ) as ds:
-        for variable in ds.data_vars:
-            namespace[f"{variable}_dataset"] = ds.get(variable)
+    with open_dataset(atmos_file, cache_dir=cache_dir, engine="h5netcdf") as ds:
+        namespace = {f"{var}_dataset": ds[var] for var in ds.data_vars}
     return namespace
 
 
-def gather_testing_data(threadsafe_data_dir: Path, worker_id: str):
+def gather_testing_data(
+    threadsafe_data_dir: str | os.PathLike[str] | Path, worker_id: str
+):
     """Gather testing data across workers."""
-    if not default_cache_dir.exists() or PREFETCH_TESTING_DATA:
+    if (
+        not default_cache_dir.joinpath(default_testdata_version).exists()
+        or PREFETCH_TESTING_DATA
+    ):
         if PREFETCH_TESTING_DATA:
             print("`XCLIM_PREFETCH_TESTING_DATA` set. Prefetching testing data...")
         if platform == "win32":
@@ -432,7 +395,7 @@
                 "UNIX-style file-locking is not supported on Windows. "
                 "Consider running `$ xclim prefetch_testing_data` to download testing data."
             )
-        elif worker_id in ["master"]:
+        elif worker_id == "master":
             populate_testing_data(branch=TESTDATA_BRANCH)
         else:
             default_cache_dir.mkdir(exist_ok=True, parents=True)
@@ -445,29 +408,33 @@
         with test_data_being_written.acquire():
             if lockfile.exists():
                 lockfile.unlink()
-            copytree(default_cache_dir, threadsafe_data_dir)
+            copytree(default_cache_dir.joinpath(default_testdata_version), threadsafe_data_dir)
 
 
 def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]:
     """Create a dictionary of relevant datasets to be patched into the xdoctest namespace."""
-    namespace: dict = dict()
-    namespace["path_to_ensemble_file"] = "EnsembleReduce/TestEnsReduceCriteria.nc"
-    namespace["path_to_pr_file"] = "NRCANdaily/nrcan_canada_daily_pr_1990.nc"
-    namespace["path_to_sfcWind_file"] = "ERA5/daily_surface_cancities_1990-1993.nc"
-    namespace["path_to_tas_file"] = "ERA5/daily_surface_cancities_1990-1993.nc"
-    namespace["path_to_tasmax_file"] = "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc"
-    namespace["path_to_tasmin_file"] = "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc"
+    namespace = {
+        "path_to_ensemble_file": "EnsembleReduce/TestEnsReduceCriteria.nc",
+        "path_to_pr_file": "NRCANdaily/nrcan_canada_daily_pr_1990.nc",
+        "path_to_sfcWind_file": "ERA5/daily_surface_cancities_1990-1993.nc",
+        "path_to_tas_file": "ERA5/daily_surface_cancities_1990-1993.nc",
+        "path_to_tasmax_file": "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc",
+        "path_to_tasmin_file": "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc",
+        "path_to_example_py": (
+            Path(__file__).parent.parent.parent.parent
+            / "docs"
+            / "notebooks"
+            / "example.py"
+        ),
+    }
 
     # For core.utils.load_module example
-    namespace["path_to_example_py"] = (
-        Path(__file__).parent.parent.parent.parent / "docs" / "notebooks" / "example.py"
-    )
-
-    time = xr.cftime_range("1990-01-01", "2049-12-31", freq="D")
+    sixty_years = xr.cftime_range("1990-01-01", "2049-12-31", freq="D")
     namespace["temperature_datasets"] = [
         xr.DataArray(
-            12 * np.random.random_sample(time.size) + 273,
-            coords={"time": time},
+            12 * np.random.random_sample(sixty_years.size) + 273,
+            coords={"time": sixty_years},
             name="tas",
             dims=("time",),
             attrs={
@@ -477,8 +444,8 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]:
             },
         ),
         xr.DataArray(
-            12 * np.random.random_sample(time.size) + 273,
-            coords={"time": time},
+            12 * np.random.random_sample(sixty_years.size) + 273,
+            coords={"time": sixty_years},
             name="tas",
             dims=("time",),
             attrs={
@@ -551,3 +518,24 @@ def _raise_on_compute(dsk: dict):
 
 assert_lazy = Callback(start=_raise_on_compute)
 """Context manager that raises an AssertionError if any dask computation is triggered."""
+
+
+def audit_url(url: str, context: str | None = None) -> str:
+    """Check if the URL is well-formed.
+
+    Raises
+    ------
+    URLError
+        If the URL is not well-formed.
+ """ + msg = "" + result = urlparse(url) + if result.scheme == "http": + msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip() + if not all([result.scheme, result.netloc]): + msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip() + + if msg: + logger.error(msg) + raise URLError(msg) + return url diff --git a/xclim/testing/utils.py b/xclim/testing/utils.py index b396c4a99..6120582f3 100644 --- a/xclim/testing/utils.py +++ b/xclim/testing/utils.py @@ -16,18 +16,6 @@ from io import StringIO from pathlib import Path from typing import TextIO -from urllib.error import HTTPError, URLError -from urllib.parse import urljoin, urlparse -from urllib.request import urlretrieve - -import pooch -from xarray import Dataset -from xarray import open_dataset as _open_dataset - -try: - from pytest_socket import SocketBlockedError -except ImportError: - SocketBlockedError = None _xclim_deps = [ "xclim", @@ -51,152 +39,18 @@ "boltons", ] -default_cache_dir = Path(pooch.os_cache("xclim-testdata")) -"""Default location for the testing data cache.""" logger = logging.getLogger("xclim") + __all__ = [ - "audit_url", - "default_cache_dir", "list_input_variables", - "open_dataset", "publish_release_notes", "run_doctests", "show_versions", ] -def audit_url(url: str, context: str | None = None) -> str: - """Check if the URL is well-formed. - - Raises - ------ - URLError - If the URL is not well-formed. - """ - msg = "" - result = urlparse(url) - if result.scheme == "http": - msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip() - if not all([result.scheme, result.netloc]): - msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip() - - if msg: - logger.error(msg) - raise URLError(msg) - return url - - -def _get( - name: Path, - github_url: str, - branch: str, - cache_dir: Path, -) -> Path: - cache_dir = cache_dir.absolute() - local_file = cache_dir / branch / name - - if not github_url.startswith("https"): - raise ValueError(f"GitHub URL not secure: '{github_url}'.") - - if not local_file.is_file(): - # This will always leave this directory on disk. - # We may want to add an option to remove it. - local_file.parent.mkdir(exist_ok=True, parents=True) - url = "/".join((github_url, "raw", branch, "data", name.as_posix())) - msg = f"Fetching remote file: {name.as_posix()}" - logger.info(msg) - try: - urlretrieve(audit_url(url), local_file) # noqa: S310 - except HTTPError as e: - msg = ( - f"{name.as_posix()} not accessible in remote repository: {url}. " - "Aborting file retrieval." - ) - raise FileNotFoundError(msg) from e - except SocketBlockedError as e: - msg = ( - f"Unable to access {name.as_posix()} online. Testing suite is being run with `--disable-socket`. " - f"If you intend to run tests with this option enabled, please download the file beforehand with the " - f"following console command: `xclim prefetch_testing_data`." - ) - raise FileNotFoundError(msg) from e - - return local_file - - -# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn -def open_dataset( - name: str | os.PathLike[str], - dap_url: str | None = None, - github_url: str = "https://github.com/Ouranosinc/xclim-testdata", - branch: str = "main", - cache: bool = True, - cache_dir: Path = default_cache_dir, - **kwargs, -) -> Dataset: - r"""Open a dataset from the online GitHub-like repository. - - If a local copy is found then always use that to avoid network traffic. 
-    Parameters
-    ----------
-    name : str or os.PathLike
-        Name of the file containing the dataset.
-    dap_url : str, optional
-        URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url.
-    github_url : str
-        URL to GitHub repository where the data is stored.
-    branch : str, optional
-        For GitHub-hosted files, the branch to download from.
-    cache_dir : Path
-        The directory in which to search for and write cached data.
-    cache : bool
-        If True, then cache data locally for use on subsequent calls.
-    \*\*kwargs
-        For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`.
-
-    Returns
-    -------
-    Union[Dataset, Path]
-
-    See Also
-    --------
-    xarray.open_dataset
-    """
-    if isinstance(name, (str, os.PathLike)):
-        name = Path(name)
-
-    if dap_url is not None:
-        dap_file_address = urljoin(dap_url, str(name))
-        try:
-            ds = _open_dataset(audit_url(dap_file_address, context="OPeNDAP"), **kwargs)
-            return ds
-        except URLError:
-            raise
-        except OSError:
-            msg = f"OPeNDAP file not read. Verify that the service is available: '{dap_file_address}'"
-            logger.error(msg)
-            raise OSError(msg)
-
-    local_file = _get(
-        name=name,
-        github_url=github_url,
-        branch=branch,
-        cache_dir=cache_dir,
-    )
-
-    try:
-        ds = _open_dataset(local_file, **kwargs)
-        if not cache:
-            ds = ds.load()
-            local_file.unlink()
-        return ds
-    except OSError as err:
-        raise err
-
-
 def list_input_variables(
     submodules: Sequence[str] | None = None, realms: Sequence[str] | None = None
 ) -> dict:
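
Reviewer note: a minimal sketch of the testing-data workflow this patch introduces, for trying the changes out locally. It assumes the repository and branch are resolved from the environment (`XCLIM_TESTDATA_REPO_URL`, `XCLIM_TESTDATA_BRANCH`) and that the two file names below are present in the packaged registry, as they are in the tests above:

    from xclim.testing.helpers import nimbus, open_dataset

    # Build the pooch fetcher; files land under the versioned cache directory.
    fetcher = nimbus()

    # Fetch registry entries; each call returns the local path, downloading only when needed.
    csv_path = fetcher.fetch("uncertainty_partitioning/seattle_avg_tas.csv")
    fetcher.fetch("ERA5/daily_surface_cancities_1990-1993.nc")

    # Open the now-cached NetCDF file, mirroring the session-scoped `open_dataset` fixture.
    ds = open_dataset(
        "ERA5/daily_surface_cancities_1990-1993.nc",
        cache_dir=fetcher.path,
        engine="h5netcdf",
    )

This mirrors how `tests/conftest.py` wires the `nimbus` and `open_dataset` fixtures together: pooch owns downloading and hash verification, and the helper's `open_dataset` only ever reads from the local cache (or an explicit `dap_url`).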