From 2ca9c377da40409804f52f4c50a0ea858582db79 Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 15 May 2024 14:25:05 -0700
Subject: [PATCH 01/71] Work out a bunch of stuff

---
 src/climate_downscale/data.py              |  4 +
 src/climate_downscale/extract/elevation.py | 29 +++---
 .../extract/ncei_climate_stations.py       | 10 +-
 .../model/prepare_predictors.py            | 94 +++++++++++++++++++
 .../model/prepare_training_data.py         | 24 +++++
 5 files changed, 144 insertions(+), 17 deletions(-)
 create mode 100644 src/climate_downscale/model/prepare_predictors.py
 create mode 100644 src/climate_downscale/model/prepare_training_data.py

diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py
index 609baea..60c6ec0 100644
--- a/src/climate_downscale/data.py
+++ b/src/climate_downscale/data.py
@@ -32,6 +32,10 @@ def ncei_climate_stations(self) -> Path:
     def srtm_elevation_gl1(self) -> Path:
         return self.extracted_data / "srtm_elevation_gl1"
 
+    @property
+    def open_topography_elevation(self) -> Path:
+        return self.extracted_data / "open_topography_elevation"
+
     @property
     def rub_local_climate_zones(self) -> Path:
         return self.extracted_data / "rub_local_climate_zones"
diff --git a/src/climate_downscale/extract/elevation.py b/src/climate_downscale/extract/elevation.py
index 261c172..3c42845 100644
--- a/src/climate_downscale/extract/elevation.py
+++ b/src/climate_downscale/extract/elevation.py
@@ -7,21 +7,22 @@
     with_output_directory,
     with_queue,
 )
+import tqdm
 
 from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData
 
 API_ENDPOINT = "https://portal.opentopography.org/API/globaldem"
 ELEVATION_MODELS = [
-    "SRTMGL3",  # SRTM Global 3 arc second (90m)
-    "SRTMGL1",  # SRTM Global 1 arc second (30m)
-    "SRTMGL1_E",  # SRTM Global 1 arc second ellipsoidal height (30m)
-    "AW3D30",  # ALOS World 3D 30m
-    "AW3D30_E",  # ALOS World 3D 30m ellipsoidal height
+    "SRTMGL3",    # SRTM Global 3 arc second (90m)
+    "SRTMGL1",    # SRTM Global 1 arc second (30m)
+    "SRTMGL1_E",  # SRTM Global 1 arc second ellipsoidal height (30m)
+    "AW3D30",     # ALOS World 3D 30m
+    "AW3D30_E",   # ALOS World 3D 30m ellipsoidal height
     "SRTM15Plus", # SRTM 15 arc second (500m)
-    "NASADEM",  # NASA DEM 1 arc second (30m)
-    "COP30",  # Copernicus 1 arc second (30m)
-    "COP90",  # Copernicus 3 arc second (90m)
+    "NASADEM",    # NASA DEM 1 arc second (30m)
+    "COP30",      # Copernicus 1 arc second (30m)
+    "COP90",      # Copernicus 3 arc second (90m)
 ]
 
 FETCH_SIZE = 5  # degrees, should be small enough for any model
@@ -38,7 +39,7 @@ def extract_elevation_main(
     key = cred_path.read_text().strip()
 
     params: dict[str, int | str] = {
-        "dem_type": model_name,
+        "demtype": model_name,
         "south": lat_start,
         "north": lat_start + FETCH_SIZE,
         "west": lon_start,
@@ -47,12 +48,12 @@ def extract_elevation_main(
         "east": lon_start + FETCH_SIZE,
         "outputFormat": "GTiff",
         "API_Key": key,
     }
 
-    response = requests.get(API_ENDPOINT, params=params, stream=True, timeout=10)
+    response = requests.get(API_ENDPOINT, params=params, stream=True, timeout=30)
     response.raise_for_status()
 
-    out_path = cd_data.srtm_elevation_gl1 / f"{model_name}_{lat_start}_{lon_start}.tif"
+    out_path = cd_data.open_topography_elevation / f"{model_name}_{lat_start}_{lon_start}.tif"
     with out_path.open("wb") as fp:
-        for chunk in response.iter_content(chunk_size=None):
+        for chunk in tqdm.tqdm(response.iter_content(chunk_size=64 * 1024**2)):
             fp.write(chunk)
 
 
@@ -72,8 +73,8 @@ def extract_elevation_main(
 )
 @click.option(
     "--lon-start",
-    required=int,
-    type=float,
+    required=True,
+    type=int,
     help="Longitude of the top-left corner of the tile.",
 )
 def extract_elevation_task(
diff --git a/src/climate_downscale/extract/ncei_climate_stations.py b/src/climate_downscale/extract/ncei_climate_stations.py
index 1ecd096..43adeb0 100644
--- a/src/climate_downscale/extract/ncei_climate_stations.py
+++ b/src/climate_downscale/extract/ncei_climate_stations.py
@@ -29,10 +29,13 @@ def extract_ncei_climate_stations_main(output_dir: str | Path, year: str) -> Non
     shutil.unpack_archive(str(gz_path), year_dir)
 
     data = pd.concat([pd.read_csv(f) for f in year_dir.glob("*.csv")])
+    data['STATION'] = data['STATION'].astype(str)
     out_path = cd_data.ncei_climate_stations / f"{year}.parquet"
-    touch(out_path)
     data.to_parquet(out_path)
 
+    gz_path.unlink()
+    shutil.rmtree(year_dir)
+
 
 @click.command()  # type: ignore[arg-type]
 @with_choice(
@@ -52,9 +55,9 @@ def extract_ncei_climate_stations_task(output_dir: str, year: str) -> None:
 @with_queue()
 def extract_ncei_climate_stations(output_dir: str, queue: str) -> None:
     jobmon.run_parallel(
-        "extract_ncei_climate_stations",
+        "extract ncei",
         node_args={
-            "output_dir": [output_dir],
+            "output-dir": [output_dir],
             "year": EXTRACTION_YEARS,
         },
         task_resources={
@@ -66,3 +69,4 @@ def extract_ncei_climate_stations(output_dir: str, queue: str) -> None:
         },
         runner="cdtask",
     )
+
diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py
new file mode 100644
index 0000000..910c572
--- /dev/null
+++ b/src/climate_downscale/model/prepare_predictors.py
@@ -0,0 +1,94 @@
+import geopandas as gpd
+import pandas as pd
+import xarray as xr
+import tqdm
+import matplotlib.pyplot as plt
+import rasterra as rt
+from affine import Affine
+import numpy as np
+from pathlib import Path
+
+def to_raster(ds, nodata, lat_col='lat', lon_col='lon'):
+    lat, lon = ds[lat_col].data, ds[lon_col].data
+
+    dlat = (lat[1:] - lat[:-1]).mean()
+    dlon = (lon[1:] - lon[:-1]).mean()
+
+    transform = Affine(
+        a=dlon,
+        b=0.,
+        c=lon[0],
+        d=0.,
+        e=-dlat,
+        f=lat[-1],
+    )
+    raster = rt.RasterArray(
+        data = ds.data,
+        transform=transform,
+        crs='EPSG:4326',
+        no_data_value=nodata,
+    )
+    return raster
+
+def make_template(x_min, y_min, stride, resolution):
+    evenly_divides = (
+        (stride % resolution < 1e-12)
+        or (resolution - stride % resolution < 1e-12)
+    )
+    assert evenly_divides
+
+    transform = Affine(
+        a=resolution,
+        b=0,
+        c=x_min,
+        d=0,
+        e=-resolution,
+        f=y_min + stride,
+    )
+
+    n_pix = int(stride / resolution)
+
+    data = np.zeros((n_pix, n_pix), dtype=np.int8)
+    template = rt.RasterArray(
+        data,
+        transform,
+        crs='EPSG:4326',
+        no_data_value=-1,
+    )
+    return template
+
+
+STRIDE = 30  # degrees
+PAD = 1
+lat_start = 0
+lon_start = 0
+
+longitudes = range(lon_start - PAD, lon_start + STRIDE + PAD)
+latitudes = range(lat_start - PAD, lat_start + STRIDE + PAD)
+
+template_era5 = make_template(x_min=lon_start, y_min=lat_start, stride=STRIDE, resolution=0.1)
+template_target = make_template(x_min=lon_start, y_min=lat_start, stride=STRIDE, resolution=0.01)
+
+root = Path("/mnt/share/erf/climate_downscale/extracted_data/open_topography_elevation/SRTM_GL3_srtm")
+paths = []
+for lon in longitudes:
+    lon_stub = f"E{lon:03}" if lon >= 0 else f"W{-lon:03}"
+
+    for lat in range(lat_start, lat_start+STRIDE):
+        if lat >= 30:
+            rel_path = f"North/North_30_60/N{lat:02}{lon_stub}.tif"
+        elif lat >=0:
+            rel_path = f"North/North_0_29/N{lat:02}{lon_stub}.tif"
+        else:
+            rel_path = f"South/S{-lat:02}{lon_stub}.tif"
+
+        p = root / rel_path
+
+        if p.exists():
+            paths.append(p)
+
+elevation = rt.load_mf_raster(paths)
+
+elevation_target = elevation.resample_to(template_target, resampling='average')
+elevation_era5 = elevation.resample_to(template_era5, resampling='average').resample_to(template_target, resampling='nearest')
+elevation_anomaly = elevation_era5 - elevation_target
\ No newline at end of file
diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py
new file mode 100644
index 0000000..6479c7c
--- /dev/null
+++ b/src/climate_downscale/model/prepare_training_data.py
@@ -0,0 +1,24 @@
+year = 2023
+
+climate_stations = pd.read_parquet(f'/mnt/share/erf/climate_downscale/extracted_data/ncei_climate_stations/{year}.parquet')
+column_map = {
+    "DATE": "date",
+    "LATITUDE": "lat",
+    "LONGITUDE": "lon",
+    "TEMP": "temperature",
+}
+climate_stations = climate_stations.rename(columns=column_map).loc[:, list(column_map.values())].dropna()
+climate_stations['date'] = pd.to_datetime(climate_stations['date'])
+climate_stations['year'] = climate_stations['date'].dt.year
+climate_stations['dayofyear'] = climate_stations['date'].dt.dayofyear
+climate_stations['temperature'] = 5/9 * (climate_stations['temperature'] - 32)
+climate_stations.loc[climate_stations.lon < 0, 'lon'] +=360
+
+era5 = xr.load_dataset(f'/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc')
+lat = xr.DataArray(climate_stations['lat'].values, dims=['points'])
+lon = xr.DataArray(climate_stations['lon'].values, dims=['points'])
+time = xr.DataArray(climate_stations['date'].values, dims=['points'])
+arr = era5.sel(latitude=lat, longitude=lon, time=time, method='nearest')
+if "expver" in arr.coords:
+    arr = arr.sel(expver=1).combine_first(arr.sel(expver=5))
+climate_stations['era5_temperature'] = arr['t2m'].to_numpy() + 273.15
\ No newline at end of file
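Note: a quick standalone check (not part of the patch) of the pixel-to-coordinate mapping implied by make_template's Affine transform. round() is used for the pixel count here to sidestep float truncation; the patch itself uses int(stride / resolution).

    import math
    from affine import Affine

    x_min, y_min, stride, resolution = 0, 0, 30, 0.1
    transform = Affine(a=resolution, b=0, c=x_min, d=0, e=-resolution, f=y_min + stride)
    n_pix = round(stride / resolution)

    # (col, row) -> (lon, lat): (0, 0) is the top-left corner of the tile and
    # (n_pix, n_pix) the bottom-right, so the grid spans the full 30-degree tile.
    assert transform * (0, 0) == (x_min, y_min + stride)
    x_br, y_br = transform * (n_pix, n_pix)
    assert math.isclose(x_br, x_min + stride, abs_tol=1e-9)
    assert math.isclose(y_br, y_min, abs_tol=1e-9)
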
"PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2068,13 +2069,13 @@ test = ["boto3 (>=1.2.4)", "hypothesis", "packaging", "pytest (>=2.8.2)", "pytes [[package]] name = "rasterra" -version = "0.5.9" +version = "0.5.10" description = "A sleek, object-oriented interface designed for intuitive raster data manipulation in Python." optional = false python-versions = "<3.13,>=3.10" files = [ - {file = "rasterra-0.5.9-py3-none-any.whl", hash = "sha256:1ef3c0e36564574f870cb919553087a8f46e761c4a87da9c385dd0fa4223293f"}, - {file = "rasterra-0.5.9.tar.gz", hash = "sha256:ebe5f16df392aa2da8dc5214a70eeabb8679d33429aae877128fc299b31cede5"}, + {file = "rasterra-0.5.10-py3-none-any.whl", hash = "sha256:9d281f98e4cb6375a12a7b09cc61dcb8a07c53ecab12e62c7147254ae04307b6"}, + {file = "rasterra-0.5.10.tar.gz", hash = "sha256:6080156dc8395c7ba427d518ad5d8cbc4fe583ca372b281819807a4d0aacd543"}, ] [package.dependencies] @@ -2530,6 +2531,17 @@ files = [ [package.dependencies] urllib3 = ">=2" +[[package]] +name = "types-tqdm" +version = "4.66.0.20240417" +description = "Typing stubs for tqdm" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-tqdm-4.66.0.20240417.tar.gz", hash = "sha256:16dce9ef522ea8d40e4f5b8d84dd8a1166eefc13ceee7a7e158bf0f1a1421a31"}, + {file = "types_tqdm-4.66.0.20240417-py3-none-any.whl", hash = "sha256:248aef1f9986b7b8c2c12b3cb4399fc17dba0a29e7e3f3f9cd704babb879383d"}, +] + [[package]] name = "typing-extensions" version = "4.11.0" @@ -2671,4 +2683,4 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "3682b1d3e4c4deae125e670f718deb49af4686d36d38fe4d38e18da86f49100c" +content-hash = "7dcea9087ab369daf1a2661541b1add72c879d396fda8b2115a24c999356a822" diff --git a/pyproject.toml b/pyproject.toml index 0e4ff6f..2745a11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ python = ">=3.10, <3.13" click = "*" numpy = "^1.26.4" pandas = "^2.2.2" -rasterra = "^0.5.9" +rasterra = "^0.5.10" shapely = "^2.0.4" geopandas = "^0.14.4" xarray = "^2024.3.0" @@ -45,6 +45,7 @@ rra-tools = "^1.0.6" netcdf4 = "^1.6.5" pyarrow = "^16.0.0" types-requests = "^2.31.0.20240406" +types-tqdm = "^4.66.0.20240417" [tool.poetry.group.dev.dependencies] mkdocstrings = {version = ">=0.23", extras = ["python"]} @@ -90,6 +91,7 @@ ignore = [ "RUF007", # zip is idiomatic, this is a dumb check "RET505", # Else after return, makes a lot of false positives "E501", # Line too long, this is autoformatted + "PYI041", # Use float instead of int | float; dumb rule ] [tool.ruff.lint.per-file-ignores] @@ -142,6 +144,7 @@ exclude = [ [[tool.mypy.overrides]] module = [ "cdsapi.*", + "affine.*", ] ignore_missing_imports = true diff --git a/src/climate_downscale/cli.py b/src/climate_downscale/cli.py index 28a009e..b962d54 100644 --- a/src/climate_downscale/cli.py +++ b/src/climate_downscale/cli.py @@ -1,6 +1,6 @@ import click -from climate_downscale import extract +from climate_downscale import extract, model @click.group() @@ -13,7 +13,7 @@ def cdtask() -> None: """Entry point for running climate downscale tasks.""" -for module in [extract]: +for module in [extract, model]: runners = getattr(module, "RUNNERS", {}) task_runners = getattr(module, "TASK_RUNNERS", {}) diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 60c6ec0..06f9db5 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -1,4 +1,7 @@ 
from pathlib import Path +from typing import Any + +import rasterra as rt DEFAULT_ROOT = "/mnt/share/erf/climate_downscale/" @@ -39,3 +42,51 @@ def open_topography_elevation(self) -> Path: @property def rub_local_climate_zones(self) -> Path: return self.extracted_data / "rub_local_climate_zones" + + @property + def model(self) -> Path: + return self.root / "model" + + @property + def predictors(self) -> Path: + return self.model / "predictors" + + def save_predictor(self, predictor: rt.RasterArray, name: str) -> None: + save_raster(predictor, self.predictors / f"{name}.tif") + + def load_predictor(self, name: str) -> rt.RasterArray: + return rt.load_raster(self.predictors / f"{name}.tif") + + +def save_raster( + raster: rt.RasterArray, + output_path: str | Path, + num_cores: int = 1, + **kwargs: Any, +) -> None: + """Save a raster to a file with standard parameters.""" + save_params = { + "tiled": True, + "blockxsize": 512, + "blockysize": 512, + "compress": "ZSTD", + "predictor": 2, # horizontal differencing + "num_threads": num_cores, + "bigtiff": "yes", + **kwargs, + } + raster.to_file(output_path, **save_params) + + +def save_raster_to_cog( + raster: rt.RasterArray, + output_path: str | Path, + num_cores: int = 1, + resampling: str = "nearest", +) -> None: + """Save a raster to a COG file.""" + cog_save_params = { + "driver": "COG", + "overview_resampling": resampling, + } + save_raster(raster, output_path, num_cores, **cog_save_params) diff --git a/src/climate_downscale/model/__init__.py b/src/climate_downscale/model/__init__.py new file mode 100644 index 0000000..81a96c6 --- /dev/null +++ b/src/climate_downscale/model/__init__.py @@ -0,0 +1,12 @@ +from climate_downscale.model.prepare_predictors import ( + prepare_predictors, + prepare_predictors_task, +) + +RUNNERS = { + "prepare_predictors": prepare_predictors, +} + +TASK_RUNNERS = { + "prepare_predictors": prepare_predictors_task, +} diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py index 910c572..4cd20dc 100644 --- a/src/climate_downscale/model/prepare_predictors.py +++ b/src/climate_downscale/model/prepare_predictors.py @@ -1,94 +1,118 @@ -import geopandas as gpd -import pandas as pd -import xarray as xr -import tqdm -import matplotlib.pyplot as plt -import rasterra as rt -from affine import Affine -import numpy as np +from collections.abc import Sequence from pathlib import Path -def to_raster(ds, nodata, lat_col='lat', lon_col='lon'): - lat, lon = ds[lat_col].data, ds[lon_col].data +import click +import rasterra as rt +from rra_tools import jobmon +from rra_tools.cli_tools import ( + with_choice, + with_output_directory, + with_queue, +) - dlat = (lat[1:] - lat[:-1]).mean() - dlon = (lon[1:] - lon[:-1]).mean() +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData +from climate_downscale.utils import make_raster_template - transform = Affine( - a=dlon, - b=0., - c=lon[0], - d=0., - e=-dlat, - f=lat[-1], - ) - raster = rt.RasterArray( - data = ds.data, - transform=transform, - crs='EPSG:4326', - no_data_value=nodata, +# Degrees + +STRIDE = 30 +LATITUDES = [str(lat) for lat in range(-90, 90, STRIDE)] +LONGITUDES = [str(lon) for lon in range(-180, 180, STRIDE)] +PAD = 1 + + +def load_elevation( + cd_data: ClimateDownscaleData, + latitudes: Sequence[int], + longitudes: Sequence[int], +) -> rt.RasterArray: + data_root = cd_data.open_topography_elevation / "SRTM_GL3_srtm" + paths = [] + for lon in longitudes: + lon_stub = f"E{lon:03}" if lon >= 0 else 
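Note: the attachment step that consumes these RUNNERS/TASK_RUNNERS registries is outside the visible cli.py hunk; the following is a plausible self-contained sketch (registry contents and command names are illustrative) of how such a registry plugs into a click group.

    import click

    @click.group()
    def cdtask() -> None:
        """Entry point for running climate downscale tasks."""

    # A module registry shaped like model/__init__.py's TASK_RUNNERS:
    TASK_RUNNERS = {"prepare_predictors": click.command()(lambda: None)}

    for name, task_runner in TASK_RUNNERS.items():
        cdtask.add_command(task_runner, name=name)

    print(sorted(cdtask.commands))  # ['prepare_predictors']
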
f"W{-lon:03}" + for lat in latitudes: + if lat >= 30: # noqa: PLR2004 + rel_path = f"North/North_30_60/N{lat:02}{lon_stub}.tif" + elif lat >= 0: + rel_path = f"North/North_0_29/N{lat:02}{lon_stub}.tif" + else: + rel_path = f"South/S{-lat:02}{lon_stub}.tif" + + p = data_root / rel_path + + if p.exists(): + paths.append(p) + + return rt.load_mf_raster(paths) + + +def prepare_predictors_main( + output_dir: str | Path, lat_start: int, lon_start: int +) -> None: + cd_data = ClimateDownscaleData(output_dir) + predictors = {} + + longitudes = range(lon_start - PAD, lon_start + STRIDE + PAD) + latitudes = range(lat_start - PAD, lat_start + STRIDE + PAD) + + # Make upscale templates, one at ERA5 resolution and one at the target + # resolution for the predictors + template_era5 = make_raster_template( + x_min=lon_start, + y_min=lat_start, + stride=STRIDE, + resolution=0.1, ) - return raster - -def make_template(x_min, y_min, stride, resolution): - evenly_divides = ( - (stride % resolution < 1e-12) - or (resolution - stride % resolution < 1e-12) + template_target = make_raster_template( + x_min=lon_start, + y_min=lat_start, + stride=STRIDE, + resolution=0.01, ) - assert evenly_divides - - transform = Affine( - a=resolution, - b=0, - c=x_min, - d=0, - e=-resolution, - f=y_min + stride, + + elevation = load_elevation(cd_data, latitudes, longitudes) + + predictors["elevation_target"] = elevation.resample_to( + template_target, resampling="average" ) - - n_pix = int(stride / resolution) - - data = np.zeros((n_pix, n_pix), dtype=np.int8) - template = rt.RasterArray( - data, - transform, - crs='EPSG:4326', - no_data_value=-1, + predictors["elevation_era5"] = elevation.resample_to( + template_era5, resampling="average" + ).resample_to(template_target, resampling="nearest") + predictors["elevation_anomaly"] = ( + predictors["elevation_era5"] - predictors["elevation_target"] ) - return template + for name, predictor in predictors.items(): + cd_data.save_predictor(predictor, f"{name}_{lat_start}_{lon_start}") -STRIDE = 30 # degrees -PAD = 1 -lat_start = 0 -lon_start = 0 - -longitudes = range(lon_start - PAD, lon_start + STRIDE + PAD) -latitudes = range(lat_start - PAD, lat_start + STRIDE + PAD) - -template_era5 = make_template(x_min=lon_start, y_min=lat_start, stride=STRIDE, resolution=0.1) -template_target = make_template(x_min=lon_start, y_min=lat_start, stride=STRIDE, resolution=0.01) - -root = Path("/mnt/share/erf/climate_downscale/extracted_data/open_topography_elevation/SRTM_GL3_srtm") -paths = [] -for lon in longitudes: - lon_stub = f"E{lon:03}" if lon >= 0 else f"W{-lon:03}" - - for lat in range(lat_start, lat_start+STRIDE): - if lat >= 30: - rel_path = f"North/North_30_60/N{lat:02}{lon_stub}.tif" - elif lat >=0: - rel_path = f"North/North_0_29/N{lat:02}{lon_stub}.tif" - else: - rel_path = f"South/S{-lat:02}{lon_stub}.tif" - - p = root / rel_path - - if p.exists(): - paths.append(p) - -elevation = rt.load_mf_raster(paths) - -elevation_target = elevation.resample_to(template_target, resampling='average') -elevation_era5 = elevation.resample_to(template_era5, resampling='average').resample_to(template_target, resampling='nearest') -elevation_anomaly = elevation_era5 - elevation_target \ No newline at end of file + +@click.command() # type: ignore[arg-type] +@with_choice("lat_start", allow_all=True, choices=LATITUDES) +@with_choice("lon_start", allow_all=True, choices=LONGITUDES) +@with_output_directory(DEFAULT_ROOT) +def prepare_predictors_task( + lat_start: int, lon_start: int, output_dir: str | Path +) 
-> None: + prepare_predictors_main(output_dir, lat_start, lon_start) + + +@click.command() # type: ignore[arg-type] +@with_output_directory(DEFAULT_ROOT) +@with_queue() +def prepare_predictors(output_dir: str, queue: str) -> None: + jobmon.run_parallel( + "model prepare_predictors", + node_args={ + "output-dir": [output_dir], + "lat_start": LATITUDES, + "lon_start": LONGITUDES, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "10G", + "runtime": "45m", + "project": "proj_rapidresponse", + }, + runner="cdtask", + ) diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py index 6479c7c..bd29904 100644 --- a/src/climate_downscale/model/prepare_training_data.py +++ b/src/climate_downscale/model/prepare_training_data.py @@ -1,24 +1,35 @@ +import pandas as pd +import xarray as xr + year = 2023 -climate_stations = pd.read_parquet(f'/mnt/share/erf/climate_downscale/extracted_data/ncei_climate_stations/{year}.parquet') +climate_stations = pd.read_parquet( + f"/mnt/share/erf/climate_downscale/extracted_data/ncei_climate_stations/{year}.parquet" +) column_map = { "DATE": "date", "LATITUDE": "lat", "LONGITUDE": "lon", "TEMP": "temperature", } -climate_stations = climate_stations.rename(columns=column_map).loc[:, list(column_map.values())].dropna() -climate_stations['date'] = pd.to_datetime(climate_stations['date']) -climate_stations['year'] = climate_stations['date'].dt.year -climate_stations['dayofyear'] = climate_stations['date'].dt.dayofyear -climate_stations['temperature'] = 5/9 * (climate_stations['temperature'] - 32) -climate_stations.loc[climate_stations.lon < 0, 'lon'] +=360 +climate_stations = ( + climate_stations.rename(columns=column_map) + .loc[:, list(column_map.values())] + .dropna() +) +climate_stations["date"] = pd.to_datetime(climate_stations["date"]) +climate_stations["year"] = climate_stations["date"].dt.year +climate_stations["dayofyear"] = climate_stations["date"].dt.dayofyear +climate_stations["temperature"] = 5 / 9 * (climate_stations["temperature"] - 32) +climate_stations.loc[climate_stations.lon < 0, "lon"] += 360 -era5 = xr.load_dataset(f'/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc') -lat = xr.DataArray(climate_stations['lat'].values, dims=['points']) -lon = xr.DataArray(climate_stations['lon'].values, dims=['points']) -time = xr.DataArray(climate_stations['date'].values, dims=['points']) -arr = era5.sel(latitude=lat, longitude=lon, time=time, method='nearest') +era5 = xr.load_dataset( + f"/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc" +) +lat = xr.DataArray(climate_stations["lat"].values, dims=["points"]) +lon = xr.DataArray(climate_stations["lon"].values, dims=["points"]) +time = xr.DataArray(climate_stations["date"].values, dims=["points"]) +arr = era5.sel(latitude=lat, longitude=lon, time=time, method="nearest") if "expver" in arr.coords: arr = arr.sel(expver=1).combine_first(arr.sel(expver=5)) -climate_stations['era5_temperature'] = arr['t2m'].to_numpy() + 273.15 \ No newline at end of file +climate_stations["era5_temperature"] = arr["t2m"].to_numpy() + 273.15 diff --git a/src/climate_downscale/utils.py b/src/climate_downscale/utils.py new file mode 100644 index 0000000..ff395ef --- /dev/null +++ b/src/climate_downscale/utils.py @@ -0,0 +1,69 @@ +import numpy as np +import rasterra as rt +import xarray as xr +from affine import Affine + + +def to_raster( + ds: xr.DataArray, + 
nodata: float | int, + lat_col: str = "lat", + lon_col: str = "lon", + crs: str = "EPSG:4326", +) -> rt.RasterArray: + """Convert an xarray DataArray to a RasterArray.""" + lat, lon = ds[lat_col].data, ds[lon_col].data + + dlat = (lat[1:] - lat[:-1]).mean() + dlon = (lon[1:] - lon[:-1]).mean() + + transform = Affine( + a=dlon, + b=0.0, + c=lon[0], + d=0.0, + e=-dlat, + f=lat[-1], + ) + return rt.RasterArray( + data=ds.data[::-1], + transform=transform, + crs=crs, + no_data_value=nodata, + ) + + +def make_raster_template( + x_min: int | float, + y_min: int | float, + stride: int | float, + resolution: int | float, + crs: str = "EPSG:4326", +) -> rt.RasterArray: + """Create a raster template with the specified dimensions and resolution.""" + tolerance = 1e-12 + evenly_divides = (stride % resolution < tolerance) or ( + resolution - stride % resolution < tolerance + ) + if not evenly_divides: + msg = "Stride must be a multiple of resolution" + raise ValueError(msg) + + transform = Affine( + a=resolution, + b=0, + c=x_min, + d=0, + e=-resolution, + f=y_min + stride, + ) + + n_pix = int(stride / resolution) + + data = np.zeros((n_pix, n_pix), dtype=np.int8) + return rt.RasterArray( + data, + transform, + crs=crs, + no_data_value=-1, + ) From dc3f8235939db1e7a0fa408af3ca377d366c29d9 Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 15 May 2024 15:56:28 -0700 Subject: [PATCH 03/71] Fix command line stuff --- src/climate_downscale/model/prepare_predictors.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py index 4cd20dc..609978d 100644 --- a/src/climate_downscale/model/prepare_predictors.py +++ b/src/climate_downscale/model/prepare_predictors.py @@ -87,13 +87,13 @@ def prepare_predictors_main( @click.command() # type: ignore[arg-type] -@with_choice("lat_start", allow_all=True, choices=LATITUDES) -@with_choice("lon_start", allow_all=True, choices=LONGITUDES) +@with_choice("lat-start", allow_all=False, choices=LATITUDES) +@with_choice("lon-start", allow_all=False, choices=LONGITUDES) @with_output_directory(DEFAULT_ROOT) def prepare_predictors_task( - lat_start: int, lon_start: int, output_dir: str | Path + lat_start: str, lon_start: str, output_dir: str, ) -> None: - prepare_predictors_main(output_dir, lat_start, lon_start) + prepare_predictors_main(output_dir, int(lat_start), int(lon_start)) @click.command() # type: ignore[arg-type] @@ -104,8 +104,8 @@ def prepare_predictors(output_dir: str, queue: str) -> None: "model prepare_predictors", node_args={ "output-dir": [output_dir], - "lat_start": LATITUDES, - "lon_start": LONGITUDES, + "lat-start": LATITUDES, + "lon-start": LONGITUDES, }, task_resources={ "queue": queue, From ee93188f5f85e8c7d10e31f03a139cc6d6dd0e83 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 15 May 2024 15:59:35 -0700 Subject: [PATCH 04/71] Add option when there are not paths --- .../model/prepare_predictors.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py index 609978d..82c7cd1 100644 --- a/src/climate_downscale/model/prepare_predictors.py +++ b/src/climate_downscale/model/prepare_predictors.py @@ -2,6 +2,7 @@ from pathlib import Path import click +import numpy as np import rasterra as rt from rra_tools import jobmon from rra_tools.cli_tools import ( @@ -42,8 +43,24 @@ def load_elevation( if 
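Note: patch 03 switches the option names to dashed form and casts inside the task because click choice values arrive as strings. A minimal standalone illustration (plain click.option stands in for rra_tools' with_choice here):

    import click

    LATITUDES = [str(lat) for lat in range(-90, 90, 30)]

    @click.command()
    @click.option("--lat-start", type=click.Choice(LATITUDES), required=True)
    def task(lat_start: str) -> None:
        click.echo(int(lat_start) + 30)  # values arrive as strings; cast before arithmetic

    if __name__ == "__main__":
        task()
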
From ee93188f5f85e8c7d10e31f03a139cc6d6dd0e83 Mon Sep 17 00:00:00 2001
From: collijk
Date: Wed, 15 May 2024 15:59:35 -0700
Subject: [PATCH 04/71] Add option when there are no paths

---
 .../model/prepare_predictors.py | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py
index 609978d..82c7cd1 100644
--- a/src/climate_downscale/model/prepare_predictors.py
+++ b/src/climate_downscale/model/prepare_predictors.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 
 import click
+import numpy as np
 import rasterra as rt
 from rra_tools import jobmon
 from rra_tools.cli_tools import (
@@ -42,8 +43,24 @@ def load_elevation(
 
             if p.exists():
                 paths.append(p)
-
-    return rt.load_mf_raster(paths)
+    if paths:
+        raster = rt.load_mf_raster(paths)
+    else:
+        template = make_raster_template(
+            x_min=longitudes[0],
+            y_min=latitudes[0],
+            stride=STRIDE,
+            resolution=0.1,
+        )
+        no_data = -32768
+        arr = np.full((len(latitudes), len(longitudes)), no_data, dtype=np.int16)
+        raster = rt.RasterArray(
+            data=arr,
+            transform=template.transform,
+            crs=template.crs,
+            no_data_value=-32768,
+        )
+    return raster
 
 
 def prepare_predictors_main(
@@ -91,7 +108,9 @@ def prepare_predictors_main(
 @with_choice("lon-start", allow_all=False, choices=LONGITUDES)
 @with_output_directory(DEFAULT_ROOT)
 def prepare_predictors_task(
-    lat_start: str, lon_start: str, output_dir: str,
+    lat_start: str,
+    lon_start: str,
+    output_dir: str,
 ) -> None:
     prepare_predictors_main(output_dir, int(lat_start), int(lon_start))

From faaaa54cd74bc6d5e34c304eff78a50590dc064d Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 15 May 2024 17:35:46 -0700
Subject: [PATCH 05/71] Add lcz

---
 src/climate_downscale/model/prepare_predictors.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py
index 82c7cd1..e71f95f 100644
--- a/src/climate_downscale/model/prepare_predictors.py
+++ b/src/climate_downscale/model/prepare_predictors.py
@@ -63,6 +63,12 @@ def load_elevation(
     return raster
 
 
+def load_lcz_data(cd_data, latitudes, longitudes):
+    path = cd_data.rub_local_climate_zones / 'lcz_filter_v2.tif'
+    bounds = (longitudes[0], latitudes[0], longitudes[1], latitudes[1])
+    return rt.load_raster(path, bounds=bounds)
+
+
 def prepare_predictors_main(
     output_dir: str | Path, lat_start: int, lon_start: int
 ) -> None:
@@ -88,6 +94,7 @@ def prepare_predictors_main(
     )
 
     elevation = load_elevation(cd_data, latitudes, longitudes)
+    lcz = load_lcz_data(cd_data, latitudes, longitudes)
 
     predictors["elevation_target"] = elevation.resample_to(
         template_target, resampling="average"
@@ -98,6 +105,8 @@ def prepare_predictors_main(
     predictors["elevation_anomaly"] = (
         predictors["elevation_era5"] - predictors["elevation_target"]
     )
+    predictors["lcz_era5"] = lcz.resample_to(template_era5, resampling="mode")
+    predictors["lcz_target"] = lcz.resample_to(template.target, resampling="mode")
 
     for name, predictor in predictors.items():
         cd_data.save_predictor(predictor, f"{name}_{lat_start}_{lon_start}")

From f0d160b35dea92cbd69fea0ced09fc30ec2e4745 Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 15 May 2024 18:06:35 -0700
Subject: [PATCH 06/71] Bugfixes to get lcz working

---
 src/climate_downscale/model/prepare_predictors.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py
index e71f95f..1335df5 100644
--- a/src/climate_downscale/model/prepare_predictors.py
+++ b/src/climate_downscale/model/prepare_predictors.py
@@ -65,7 +65,7 @@ def load_elevation(
 def load_lcz_data(cd_data, latitudes, longitudes):
     path = cd_data.rub_local_climate_zones / 'lcz_filter_v2.tif'
-    bounds = (longitudes[0], latitudes[0], longitudes[1], latitudes[1])
+    bounds = (longitudes[0], latitudes[0], longitudes[-1], latitudes[-1])
     return rt.load_raster(path, bounds=bounds)
 
 
@@ -106,7 +106,7 @@ def prepare_predictors_main(
     predictors["lcz_era5"] = lcz.resample_to(template_era5, resampling="mode")
-    predictors["lcz_target"] = lcz.resample_to(template.target, resampling="mode")
+    predictors["lcz_target"] = lcz.resample_to(template_target, resampling="mode")
 
     for name, predictor in predictors.items():
         cd_data.save_predictor(predictor, f"{name}_{lat_start}_{lon_start}")
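Note: the LCZ rasters are resampled with "mode" while elevation uses "average" because local climate zones are categorical class codes; averaging them produces values that are not classes. A toy 2x2 -> 1x1 aggregation makes the difference concrete:

    import numpy as np

    block = np.array([[2, 2], [17, 6]])  # four fine pixels -> one coarse pixel
    print(block.mean())  # 6.75 -- not a valid LCZ class
    values, counts = np.unique(block, return_counts=True)
    print(values[counts.argmax()])  # 2 -- the dominant class, which "mode" keeps
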
From 37f40cb61320ddf32a3b52036bdd4980c77db095 Mon Sep 17 00:00:00 2001
From: James Collins
Date: Thu, 16 May 2024 11:10:20 -0700
Subject: [PATCH 07/71] Prep training data

---
 .../model/prepare_training_data.py | 56 +++++++++++++++----
 1 file changed, 45 insertions(+), 11 deletions(-)

diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py
index bd29904..3058a58 100644
--- a/src/climate_downscale/model/prepare_training_data.py
+++ b/src/climate_downscale/model/prepare_training_data.py
@@ -1,8 +1,26 @@
 import pandas as pd
 import xarray as xr
+import rasterra as rt
+from pathlib import Path
+
+def get_era5_temperature(year: int | str, cs_df: pd.DataFrame):
+    lat = xr.DataArray(cs_df["lat"].values, dims=["points"])
+    lon = xr.DataArray(cs_df["lon"].values, dims=["points"])
+    time = xr.DataArray(cs_df["date"].values, dims=["points"])
+
+    era5 = xr.load_dataset(
+        f"/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc"
+    )
+
+    era5 = era5.assign_coords(longitude=(((era5.longitude + 180) % 360) - 180)).sortby(['latitude', 'longitude'])
+    arr = era5.sel(latitude=lat, longitude=lon, time=time, method="nearest")
+    if "expver" in era5.coords:
+        arr = arr.sel(expver=1).combine_first(arr.sel(expver=5))
+    return arr['t2m'].to_numpy() - 273.15
 
 year = 2023
 
+# Load and cleanup
 climate_stations = pd.read_parquet(
     f"/mnt/share/erf/climate_downscale/extracted_data/ncei_climate_stations/{year}.parquet"
 )
@@ -11,15 +29,41 @@
     "LATITUDE": "lat",
     "LONGITUDE": "lon",
     "TEMP": "temperature",
+    "ELEVATION": "ncei_elevation",
 }
 climate_stations = (
     climate_stations.rename(columns=column_map)
     .loc[:, list(column_map.values())]
     .dropna()
+    .reset_index(drop=True)
 )
+
+# Do time things
 climate_stations["date"] = pd.to_datetime(climate_stations["date"])
 climate_stations["year"] = climate_stations["date"].dt.year
 climate_stations["dayofyear"] = climate_stations["date"].dt.dayofyear
+
+# Add temperature
 climate_stations["temperature"] = 5 / 9 * (climate_stations["temperature"] - 32)
-climate_stations.loc[climate_stations.lon < 0, "lon"] += 360
+climate_stations['era5_temperature'] = get_era5_temperature(year, climate_stations)
 
-era5 = xr.load_dataset(
-    f"/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc"
-)
-lat = xr.DataArray(climate_stations["lat"].values, dims=["points"])
-lon = xr.DataArray(climate_stations["lon"].values, dims=["points"])
-time = xr.DataArray(climate_stations["date"].values, dims=["points"])
-arr = era5.sel(latitude=lat, longitude=lon, time=time, method="nearest")
-if "expver" in arr.coords:
-    arr = arr.sel(expver=1).combine_first(arr.sel(expver=5))
-climate_stations["era5_temperature"] = arr["t2m"].to_numpy() + 273.15
\ No newline at end of file
+# Elevation pieces
+target_elevation = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("elevation_target_*.tif")))
+climate_stations['target_elevation'] = srtm_elevation.select(climate_stations['lon'], climate_stations['lat'])
+era5_elevation = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("elevation_era5_*.tif")))
+climate_stations['era5_elevation'] = era5_elevation.select(climate_stations['lon'], climate_stations['lat'])
+
+climate_stations['elevation'] = climate_stations['ncei_elevation']
+missing_elevation = climate_stations['elevation'] < -999
+
+climate_stations['elevation'] = climate_stations['ncei_elevation']
+missing_elevation = climate_stations['elevation'] < -999
+climate_stations.loc[missing_elevation, 'elevation'] = climate_stations.loc[missing_elevation, 'target_elevation']
+still_missing_elevation = climate_stations['elevation'] < -999
+climate_stations = climate_stations.loc[~still_missing_elevation]
+
+# Local climate zone
+target_lcz = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("lcz_target_*.tif")))
+climate_stations['target_lcz'] = target_lcz.select(climate_stations['lon'], climate_stations['lat'])
+era5_lcz = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("lcz_era5_*.tif")))
+climate_stations['era5_lcz'] = era5_lcz.select(climate_stations['lon'], climate_stations['lat'])
\ No newline at end of file
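Note: the GSOD TEMP column is in degrees Fahrenheit, which is why the patch converts with 5/9 * (F - 32); a quick spot check of the formula at the freezing and boiling points:

    def f_to_c(temp_f: float) -> float:
        return 5 / 9 * (temp_f - 32)

    print(f_to_c(32.0), f_to_c(212.0))  # 0.0 100.0
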
From a74bbb5c504b5c71f1af07439f67e00249339919 Mon Sep 17 00:00:00 2001
From: collijk
Date: Thu, 16 May 2024 11:24:22 -0700
Subject: [PATCH 08/71] lcz extraction task

---
 poetry.lock                             |  8 +-
 pyproject.toml                          |  3 +-
 .../extract/rub_local_climate_zones.py  | 11 ++-
 .../model/prepare_predictors.py         |  6 +-
 .../model/prepare_training_data.py      | 77 +++++++++++++------
 5 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index e003253..310bfd0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2069,13 +2069,13 @@ test = ["boto3 (>=1.2.4)", "hypothesis", "packaging", "pytest (>=2.8.2)", "pytes
 
 [[package]]
 name = "rasterra"
-version = "0.5.10"
+version = "0.5.11"
 description = "A sleek, object-oriented interface designed for intuitive raster data manipulation in Python."
 optional = false
 python-versions = "<3.13,>=3.10"
 files = [
-    {file = "rasterra-0.5.10-py3-none-any.whl", hash = "sha256:9d281f98e4cb6375a12a7b09cc61dcb8a07c53ecab12e62c7147254ae04307b6"},
-    {file = "rasterra-0.5.10.tar.gz", hash = "sha256:6080156dc8395c7ba427d518ad5d8cbc4fe583ca372b281819807a4d0aacd543"},
+    {file = "rasterra-0.5.11-py3-none-any.whl", hash = "sha256:85344ec80b38c6aa0d337fb564d98f69dc5cd3f7699162e5016b65cf2a33f8ad"},
+    {file = "rasterra-0.5.11.tar.gz", hash = "sha256:8f7e396e50ec0b8c7e7b2bac5e01bd865ebcd2f8757f72ab82daf7c73723e5d9"},
 ]
 
 [package.dependencies]
@@ -2683,4 +2683,4 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10, <3.13"
-content-hash = "7dcea9087ab369daf1a2661541b1add72c879d396fda8b2115a24c999356a822"
+content-hash = "76b81344dbd944abdc6006c3ce2e8a8ce7ae3131f747247396f1fb01ab80e1a3"
diff --git a/pyproject.toml b/pyproject.toml
index 2745a11..79a58f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ python = ">=3.10, <3.13"
 click = "*"
 numpy = "^1.26.4"
 pandas = "^2.2.2"
-rasterra = "^0.5.10"
+rasterra = "^0.5.11"
 shapely = "^2.0.4"
 geopandas = "^0.14.4"
 xarray = "^2024.3.0"
@@ -92,6 +92,7 @@ ignore = [
     "RET505",  # Else after return, makes a lot of false positives
     "E501",    # Line too long, this is autoformatted
     "PYI041",  # Use float instead of int | float; dumb rule
+    "T201",    # print is fine for now.
 ]
 
 [tool.ruff.lint.per-file-ignores]
diff --git a/src/climate_downscale/extract/rub_local_climate_zones.py b/src/climate_downscale/extract/rub_local_climate_zones.py
index 172a3e7..8ce475e 100644
--- a/src/climate_downscale/extract/rub_local_climate_zones.py
+++ b/src/climate_downscale/extract/rub_local_climate_zones.py
@@ -1,9 +1,10 @@
 from pathlib import Path
 
 import click
+from rra_tools.cli_tools import with_output_directory
 from rra_tools.shell_tools import wget
 
-from climate_downscale.data import ClimateDownscaleData
+from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData
 
 URL_TEMPLATE = "https://zenodo.org/records/8419340/files/{file_name}?download=1"
 FILES = [
@@ -20,10 +21,12 @@ def extract_rub_local_climate_zones_main(output_dir: str | Path) -> None:
     out_root = data.rub_local_climate_zones
 
     for file_name in FILES:
+        print(f"Downloading {file_name}")
         url = URL_TEMPLATE.format(file_name=file_name)
         wget(url, out_root / file_name)
 
 
-@click.command()
-def extract_rub_local_climate_zones() -> None:
-    raise NotImplementedError
+@click.command()  # type: ignore[arg-type]
+@with_output_directory(DEFAULT_ROOT)
+def extract_rub_local_climate_zones(output_dir: str) -> None:
+    extract_rub_local_climate_zones_main(output_dir)
diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py
index 1335df5..957bdb0 100644
--- a/src/climate_downscale/model/prepare_predictors.py
+++ b/src/climate_downscale/model/prepare_predictors.py
@@ -63,8 +63,10 @@ def load_elevation(
     return raster
 
 
-def load_lcz_data(cd_data, latitudes, longitudes):
-    path = cd_data.rub_local_climate_zones / 'lcz_filter_v2.tif'
+def load_lcz_data(
+    cd_data: ClimateDownscaleData, latitudes: Sequence[int], longitudes: Sequence[int]
+) -> rt.RasterArray:
+    path = cd_data.rub_local_climate_zones / "lcz_filter_v2.tif"
     bounds = (longitudes[0], latitudes[0], longitudes[-1], latitudes[-1])
     return rt.load_raster(path, bounds=bounds)
diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py
index 3058a58..d891957 100644
--- a/src/climate_downscale/model/prepare_training_data.py
+++ b/src/climate_downscale/model/prepare_training_data.py
@@ -1,24 +1,35 @@
+from pathlib import Path
+
+import numpy as np
+import numpy.typing as npt
 import pandas as pd
-import xarray as xr
 import rasterra as rt
-from pathlib import Path
+import xarray as xr
 
-def get_era5_temperature(year: int | str, cs_df: pd.DataFrame):
+
+def get_era5_temperature(
+    year: int | str, cs_df: pd.DataFrame
+) -> npt.NDArray[np.float64]:
     lat = xr.DataArray(cs_df["lat"].values, dims=["points"])
     lon = xr.DataArray(cs_df["lon"].values, dims=["points"])
     time = xr.DataArray(cs_df["date"].values, dims=["points"])
 
     era5 = xr.load_dataset(
         f"/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc"
     )
 
-    era5 = era5.assign_coords(longitude=(((era5.longitude + 180) % 360) - 180)).sortby(['latitude', 'longitude'])
+    era5 = era5.assign_coords(longitude=(((era5.longitude + 180) % 360) - 180)).sortby(
+        ["latitude", "longitude"]
+    )
     arr = era5.sel(latitude=lat, longitude=lon, time=time, method="nearest")
     if "expver" in era5.coords:
         arr = arr.sel(expver=1).combine_first(arr.sel(expver=5))
-    return arr['t2m'].to_numpy() - 273.15
+    return arr["t2m"].to_numpy() - 273.15
@@ -29,41 +40,63 @@
 # Add temperature
 climate_stations["temperature"] = 5 / 9 * (climate_stations["temperature"] - 32)
-climate_stations['era5_temperature'] = get_era5_temperature(year, climate_stations)
+climate_stations["era5_temperature"] = get_era5_temperature(year, climate_stations)
+
+lon, lat = climate_stations["lon"].to_numpy(), climate_stations["lat"].to_numpy()
 
 # Elevation pieces
-target_elevation = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("elevation_target_*.tif")))
-climate_stations['target_elevation'] = srtm_elevation.select(climate_stations['lon'], climate_stations['lat'])
-era5_elevation = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("elevation_era5_*.tif")))
-climate_stations['era5_elevation'] = era5_elevation.select(climate_stations['lon'], climate_stations['lat'])
-
-climate_stations['elevation'] = climate_stations['ncei_elevation']
-missing_elevation = climate_stations['elevation'] < -999
-
-climate_stations['elevation'] = climate_stations['ncei_elevation']
-missing_elevation = climate_stations['elevation'] < -999
-climate_stations.loc[missing_elevation, 'elevation'] = climate_stations.loc[missing_elevation, 'target_elevation']
-still_missing_elevation = climate_stations['elevation'] < -999
-climate_stations = climate_stations.loc[~still_missing_elevation]
+target_elevation = rt.load_mf_raster(
+    list(
+        Path("/mnt/share/erf/climate_downscale/model/predictors").glob(
+            "elevation_target_*.tif"
+        )
+    )
+)
+climate_stations["target_elevation"] = target_elevation.select(lon, lat)
+era5_elevation = rt.load_mf_raster(
+    list(
+        Path("/mnt/share/erf/climate_downscale/model/predictors").glob(
+            "elevation_era5_*.tif"
+        )
+    )
+)
+climate_stations["era5_elevation"] = era5_elevation.select(lon, lat)
+
+climate_stations["elevation"] = climate_stations["ncei_elevation"]
+nodata_val = -999
+missing_elevation = climate_stations["elevation"] < nodata_val
+climate_stations.loc[missing_elevation, "elevation"] = climate_stations.loc[
+    missing_elevation, "target_elevation"
+]
+still_missing_elevation = climate_stations["elevation"] < nodata_val
+climate_stations = climate_stations.loc[~still_missing_elevation]
 
 # Local climate zone
-target_lcz = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("lcz_target_*.tif")))
-climate_stations['target_lcz'] = target_lcz.select(climate_stations['lon'], climate_stations['lat'])
-era5_lcz = rt.load_mf_raster(list(Path("/mnt/share/erf/climate_downscale/model/predictors").glob("lcz_era5_*.tif")))
-climate_stations['era5_lcz'] = era5_lcz.select(climate_stations['lon'], climate_stations['lat'])
\ No newline at end of file
+target_lcz = rt.load_mf_raster(
+    list(
+        Path("/mnt/share/erf/climate_downscale/model/predictors").glob(
+            "lcz_target_*.tif"
+        )
+    )
+)
+climate_stations["target_lcz"] = target_lcz.select(lon, lat)
+era5_lcz = rt.load_mf_raster(
+    list(
+        Path("/mnt/share/erf/climate_downscale/model/predictors").glob("lcz_era5_*.tif")
+    )
+)
+climate_stations["era5_lcz"] = era5_lcz.select(lon, lat)
From 06cd5d4ff64167e0b4fc8d41ee4e9160ceebe22b Mon Sep 17 00:00:00 2001
From: collijk
Date: Thu, 16 May 2024 13:46:39 -0700
Subject: [PATCH 09/71] Prep training data script and clean up option usage

---
 src/climate_downscale/cli_options.py       | 120 +++++++++++
 src/climate_downscale/data.py              |  43 +++++-
 src/climate_downscale/extract/elevation.py |  43 ++--
 src/climate_downscale/extract/era5.py      |  84 ++------
 .../extract/ncei_climate_stations.py       |   8 +-
 .../extract/rub_local_climate_zones.py     |   4 +-
 .../model/prepare_predictors.py            |  28 +--
 .../model/prepare_training_data.py         | 193 ++++++++++--------
 8 files changed, 325 insertions(+), 198 deletions(-)
 create mode 100644 src/climate_downscale/cli_options.py

diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py
new file mode 100644
index 0000000..a1eeeec
--- /dev/null
+++ b/src/climate_downscale/cli_options.py
@@ -0,0 +1,120 @@
+from typing import ParamSpec, TypeVar
+
+from rra_tools.cli_tools import (
+    RUN_ALL,
+    ClickOption,
+    with_choice,
+    with_debugger,
+    with_input_directory,
+    with_num_cores,
+    with_output_directory,
+    with_progress_bar,
+    with_queue,
+    with_verbose,
+)
+
+_T = TypeVar("_T")
+_P = ParamSpec("_P")
+
+
+VALID_YEARS = [str(y) for y in range(1990, 2024)]
+
+
+def with_year(
+    *,
+    allow_all: bool = False,
+) -> ClickOption[_P, _T]:
+    return with_choice(
+        "year",
+        "y",
+        allow_all=allow_all,
+        choices=VALID_YEARS,
+        help="Year to extract data for.",
+    )
+
+
+VALID_MONTHS = [f"{i:02d}" for i in range(1, 13)]
+
+
+def with_month(
+    *,
+    allow_all: bool = False,
+) -> ClickOption[_P, _T]:
+    return with_choice(
+        "month",
+        "m",
+        allow_all=allow_all,
+        choices=VALID_MONTHS,
+        help="Month to extract data for.",
+    )
+
+
+VALID_CLIMATE_VARIABLES = [
+    "total_precipitation",
+    "2m_temperature",
+]
+
+
+def with_climate_variable(
+    *,
+    allow_all: bool = False,
+) -> ClickOption[_P, _T]:
+    return with_choice(
+        "climate-variable",
+        "x",
+        allow_all=allow_all,
+        choices=VALID_CLIMATE_VARIABLES,
+        help="Variable to extract.",
+    )
+
+
+STRIDE = 30
+LATITUDES = [str(lat) for lat in range(-90, 90, STRIDE)]
+LONGITUDES = [str(lon) for lon in range(-180, 180, STRIDE)]
+
+
+def with_lat_start(
+    *,
+    allow_all: bool = False,
+) -> ClickOption[_P, _T]:
+    return with_choice(
+        "lat-start",
+        allow_all=allow_all,
+        choices=LATITUDES,
+        help="Latitude of the top-left corner of the tile.",
+    )
+
+
+def with_lon_start(
+    *,
+    allow_all: bool = False,
+) -> ClickOption[_P, _T]:
+    return with_choice(
+        "lon-start",
+        allow_all=allow_all,
+        choices=LONGITUDES,
+        help="Longitude of the top-left corner of the tile.",
+    )
+
+
+__all__ = [
+    "VALID_YEARS",
+    "VALID_MONTHS",
+    "VALID_CLIMATE_VARIABLES",
+    "STRIDE",
+    "LATITUDES",
+    "LONGITUDES",
+    "with_year",
+    "with_month",
+    "with_climate_variable",
+    "with_lat_start",
+    "with_lon_start",
+    "with_output_directory",
+    "with_queue",
+    "with_verbose",
+    "with_debugger",
+    "with_input_directory",
+    "with_num_cores",
+    "with_progress_bar",
+    "RUN_ALL",
+]
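Note: a quick sanity check (not part of the patch) of the choice lists defined in cli_options.py; range() excludes its stop value, so the year choices cover 1990 through 2023, and the month choices are zero-padded strings:

    VALID_YEARS = [str(y) for y in range(1990, 2024)]
    VALID_MONTHS = [f"{i:02d}" for i in range(1, 13)]
    print(VALID_YEARS[0], VALID_YEARS[-1], len(VALID_YEARS))  # 1990 2023 34
    print(VALID_MONTHS[0], VALID_MONTHS[-1])                  # 01 12
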
diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py
index 06f9db5..6ca01d7 100644
--- a/src/climate_downscale/data.py
+++ b/src/climate_downscale/data.py
@@ -1,7 +1,10 @@
 from pathlib import Path
 from typing import Any
 
+import pandas as pd
 import rasterra as rt
+import xarray as xr
+from rra_tools.shell_tools import touch
 
 DEFAULT_ROOT = "/mnt/share/erf/climate_downscale/"
 
@@ -27,10 +30,23 @@ def extracted_data(self) -> Path:
     def era5_temperature_daily_mean(self) -> Path:
         return self.extracted_data / "era5_temperature_daily_mean"
 
+    def load_era5_temperature_daily_mean(self, year: int | str) -> xr.Dataset:
+        return xr.load_dataset(
+            self.era5_temperature_daily_mean / f"{year}_era5_temp_daily.nc"
+        )
+
     @property
     def ncei_climate_stations(self) -> Path:
         return self.extracted_data / "ncei_climate_stations"
 
+    def save_ncei_climate_stations(self, df: pd.DataFrame, year: int | str) -> None:
+        path = self.ncei_climate_stations / f"{year}.parquet"
+        touch(path, exist_ok=True)
+        df.to_parquet(path)
+
+    def load_ncei_climate_stations(self, year: int | str) -> pd.DataFrame:
+        return pd.read_parquet(self.ncei_climate_stations / f"{year}.parquet")
+
     @property
     def srtm_elevation_gl1(self) -> Path:
         return self.extracted_data / "srtm_elevation_gl1"
@@ -60,11 +76,30 @@ def model(self) -> Path:
     @property
     def predictors(self) -> Path:
         return self.model / "predictors"
 
-    def save_predictor(self, predictor: rt.RasterArray, name: str) -> None:
-        save_raster(predictor, self.predictors / f"{name}.tif")
+    def save_predictor(
+        self,
+        predictor: rt.RasterArray,
+        name: str,
+        lat_start: int,
+        lon_start: int,
+    ) -> None:
+        save_raster(predictor, self.predictors / f"{name}_{lat_start}_{lon_start}.tif")
 
     def load_predictor(self, name: str) -> rt.RasterArray:
-        return rt.load_raster(self.predictors / f"{name}.tif")
+        paths = list(self.predictors.glob(f"{name}_*.tif"))
+        return rt.load_mf_raster(paths)
+
+    @property
+    def training_data(self) -> Path:
+        return self.model / "training_data"
+
+    def save_training_data(self, df: pd.DataFrame, year: int | str) -> None:
+        path = self.training_data / f"{year}.parquet"
+        touch(path, exist_ok=True)
+        df.to_parquet(path)
+
+    def load_training_data(self, year: int | str) -> pd.DataFrame:
+        return pd.read_parquet(self.training_data / f"{year}.parquet")
 
 
 def save_raster(
@@ -95,6 +130,7 @@ def save_raster(
         "bigtiff": "yes",
         **kwargs,
     }
+    touch(output_path, exist_ok=True)
     raster.to_file(output_path, **save_params)
 
 
@@ -109,4 +145,5 @@ def save_raster_to_cog(
         "driver": "COG",
         "overview_resampling": resampling,
     }
+    touch(output_path, exist_ok=True)
     save_raster(raster, output_path, num_cores, **cog_save_params)
diff --git a/src/climate_downscale/extract/elevation.py b/src/climate_downscale/extract/elevation.py
index 3c42845..5549886 100644
--- a/src/climate_downscale/extract/elevation.py
+++ b/src/climate_downscale/extract/elevation.py
@@ -2,27 +2,24 @@
 
 import click
 import requests
-from rra_tools import jobmon
-from rra_tools.cli_tools import (
-    with_output_directory,
-    with_queue,
-)
 import tqdm
+from rra_tools import jobmon
 
+from climate_downscale import cli_options as clio
 from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData
 
 API_ENDPOINT = "https://portal.opentopography.org/API/globaldem"
 ELEVATION_MODELS = [
-    "SRTMGL3",    # SRTM Global 3 arc second (90m)
-    "SRTMGL1",    # SRTM Global 1 arc second (30m)
-    "SRTMGL1_E",  # SRTM Global 1 arc second ellipsoidal height (30m)
-    "AW3D30",     # ALOS World 3D 30m
-    "AW3D30_E",   # ALOS World 3D 30m ellipsoidal height
+    "SRTMGL3",  # SRTM Global 3 arc second (90m)
+    "SRTMGL1",  # SRTM Global 1 arc second (30m)
+    "SRTMGL1_E",  # SRTM Global 1 arc second ellipsoidal height (30m)
+    "AW3D30",  # ALOS World 3D 30m
+    "AW3D30_E",  # ALOS World 3D 30m ellipsoidal height
     "SRTM15Plus", # SRTM 15 arc second (500m)
-    "NASADEM",    # NASA DEM 1 arc second (30m)
-    "COP30",      # Copernicus 1 arc second (30m)
-    "COP90",      # Copernicus 3 arc second (90m)
+    "NASADEM",  # NASA DEM 1 arc second (30m)
+    "COP30",  # Copernicus 1 arc second (30m)
+    "COP90",  # Copernicus 3 arc second (90m)
 ]
 
 FETCH_SIZE = 5  # degrees, should be small enough for any model
@@ -48,12 +45,14 @@ def extract_elevation_main(
     response = requests.get(API_ENDPOINT, params=params, stream=True, timeout=30)
     response.raise_for_status()
 
-    out_path = cd_data.open_topography_elevation / f"{model_name}_{lat_start}_{lon_start}.tif"
+    out_path = (
+        cd_data.open_topography_elevation / f"{model_name}_{lat_start}_{lon_start}.tif"
+    )
     with out_path.open("wb") as fp:
         for chunk in tqdm.tqdm(response.iter_content(chunk_size=64 * 1024**2)):
             fp.write(chunk)
 
 
 @click.command()  # type: ignore[arg-type]
-@with_output_directory(DEFAULT_ROOT)
+@clio.with_output_directory(DEFAULT_ROOT)
 @click.option(
     "--model-name",
     required=True,
     type=click.Choice(ELEVATION_MODELS),
     help="Name of the elevation model to download.",
 )
@@ -84,24 +83,34 @@ def extract_elevation_task(
     lon_start: int,
 ) -> None:
     """Download elevation data from Open Topography."""
+    invalid = True
+    if invalid:
+        msg = "Downloaded using aws cli, this implementation is not valid"
+        raise NotImplementedError(msg)
+
     extract_elevation_main(output_dir, model_name, lat_start, lon_start)
 
 
 @click.command()  # type: ignore[arg-type]
-@with_output_directory(DEFAULT_ROOT)
+@clio.with_output_directory(DEFAULT_ROOT)
 @click.option(
     "--model-name",
     required=True,
     type=click.Choice(ELEVATION_MODELS),
     help="Name of the elevation model to download.",
 )
-@with_queue()
+@clio.with_queue()
 def extract_elevation(
     output_dir: str,
     model_name: str,
     queue: str,
 ) -> None:
     """Download elevation data from Open Topography."""
+    invalid = True
+    if invalid:
+        msg = "Downloaded using aws cli, this implementation is not valid"
+        raise NotImplementedError(msg)
+
     lat_starts = list(range(-90, 90, FETCH_SIZE))
     lon_starts = list(range(-180, 180, FETCH_SIZE))
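Note: the elevation extractor streams the GeoTIFF response in 64 MiB chunks wrapped in tqdm (added in patch 01); a minimal standalone sketch of that download pattern, with an illustrative URL and filename:

    import requests
    import tqdm

    response = requests.get("https://example.com/dem.tif", stream=True, timeout=30)
    response.raise_for_status()
    with open("dem.tif", "wb") as fp:
        for chunk in tqdm.tqdm(response.iter_content(chunk_size=64 * 1024**2)):
            fp.write(chunk)
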
diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py
index 026448f..e27ad8e 100644
--- a/src/climate_downscale/extract/era5.py
+++ b/src/climate_downscale/extract/era5.py
@@ -1,68 +1,13 @@
 from pathlib import Path
-from typing import ParamSpec, TypeVar
 
 import cdsapi
 import click
 from rra_tools import jobmon
-from rra_tools.cli_tools import (
-    RUN_ALL,
-    ClickOption,
-    with_choice,
-    with_output_directory,
-    with_queue,
-)
+from rra_tools.shell_tools import touch
 
+from climate_downscale import cli_options as clio
 from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData
 
-VALID_YEARS = [str(y) for y in range(1990, 2024)]
-VALID_MONTHS = [f"{i:02d}" for i in range(1, 13)]
-VALID_VARIABLES = [
-    "total_precipitation",
-    "2m_temperature",
-]
-
-_T = TypeVar("_T")
-_P = ParamSpec("_P")
-
-
-def with_year(
-    *,
-    allow_all: bool = False,
-) -> ClickOption[_P, _T]:
-    return with_choice(
-        "year",
-        "y",
-        allow_all=allow_all,
-        choices=VALID_YEARS,
-        help="Year to extract data for.",
-    )
-
-
-def with_month(
-    *,
-    allow_all: bool = False,
-) -> ClickOption[_P, _T]:
-    return with_choice(
-        "month",
-        "m",
-        allow_all=allow_all,
-        choices=VALID_MONTHS,
-        help="Month to extract data for.",
-    )
-
-
-def with_variable(
-    *,
-    allow_all: bool = False,
-) -> ClickOption[_P, _T]:
-    return with_choice(
-        "variable",
-        "x",
-        allow_all=allow_all,
-        choices=VALID_VARIABLES,
-        help="Variable to extract.",
-    )
-
 
 def extract_era5_main(
@@ -99,31 +44,32 @@ def extract_era5_main(
     )
 
     out_path = cddata.era5_temperature_daily_mean / f"{variable}_{year}_{month}.nc"
+    touch(out_path, exist_ok=True)
     copernicus.download(result, [out_path])
 
 
 @click.command()  # type: ignore[arg-type]
-@with_output_directory(DEFAULT_ROOT)
-@with_year()
-@with_month()
-@with_variable()
-def extract_era5_task(year: str, month: str, variable: str) -> None:
-    extract_era5_main(DEFAULT_ROOT, year, month, variable)
+@clio.with_output_directory(DEFAULT_ROOT)
+@clio.with_year()
+@clio.with_month()
+@clio.with_climate_variable()
+def extract_era5_task(year: str, month: str, climate_variable: str) -> None:
+    extract_era5_main(DEFAULT_ROOT, year, month, climate_variable)
 
 
 @click.command()  # type: ignore[arg-type]
-@with_output_directory(DEFAULT_ROOT)
-@with_year(allow_all=True)
-@with_variable(allow_all=True)
-@with_queue()
+@clio.with_output_directory(DEFAULT_ROOT)
+@clio.with_year(allow_all=True)
+@clio.with_climate_variable(allow_all=True)
+@clio.with_queue()
 def extract_era5(
     output_dir: str,
     year: str,
     variable: str,
     queue: str,
 ) -> None:
-    years = VALID_YEARS if year == RUN_ALL else [year]
-    variables = VALID_VARIABLES if variable == RUN_ALL else [variable]
+    years = clio.VALID_YEARS if year == clio.RUN_ALL else [year]
+    variables = clio.VALID_CLIMATE_VARIABLES if variable == clio.RUN_ALL else [variable]
 
     jobmon.run_parallel(
         task_name="extract_era5",
diff --git a/src/climate_downscale/extract/ncei_climate_stations.py b/src/climate_downscale/extract/ncei_climate_stations.py
index 43adeb0..048f6a8 100644
--- a/src/climate_downscale/extract/ncei_climate_stations.py
+++ b/src/climate_downscale/extract/ncei_climate_stations.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from rra_tools import jobmon
 from rra_tools.cli_tools import with_choice, with_output_directory, with_queue
-from rra_tools.shell_tools import mkdir, touch, wget
+from rra_tools.shell_tools import mkdir, wget
 
 from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData
 
@@ -29,9 +29,8 @@ def extract_ncei_climate_stations_main(output_dir: str | Path, year: str) -> Non
     shutil.unpack_archive(str(gz_path), year_dir)
 
     data = pd.concat([pd.read_csv(f) for f in year_dir.glob("*.csv")])
-    data['STATION'] = data['STATION'].astype(str)
-    out_path = cd_data.ncei_climate_stations / f"{year}.parquet"
-    data.to_parquet(out_path)
+    data["STATION"] = data["STATION"].astype(str)
+    cd_data.save_ncei_climate_stations(data, year)
 
     gz_path.unlink()
     shutil.rmtree(year_dir)
@@ -69,4 +68,3 @@ def extract_ncei_climate_stations(output_dir: str, queue: str) -> None:
         },
         runner="cdtask",
     )
-
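Note: jobmon.run_parallel is handed a list per node arg throughout these patches; presumably it fans one task out per combination, so running extract_era5 over all years and both climate variables would mean:

    from itertools import product

    years = [str(y) for y in range(1990, 2024)]
    variables = ["total_precipitation", "2m_temperature"]
    print(len(list(product(years, variables))))  # 68 tasks
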
lcz.resample_to(template_target, resampling="mode") for name, predictor in predictors.items(): - cd_data.save_predictor(predictor, f"{name}_{lat_start}_{lon_start}") + cd_data.save_predictor(predictor, name, lat_start, lon_start) @click.command() # type: ignore[arg-type] -@with_choice("lat-start", allow_all=False, choices=LATITUDES) -@with_choice("lon-start", allow_all=False, choices=LONGITUDES) -@with_output_directory(DEFAULT_ROOT) +@clio.with_lat_start(allow_all=False) +@clio.with_lon_start(allow_all=False) +@clio.with_output_directory(DEFAULT_ROOT) def prepare_predictors_task( lat_start: str, lon_start: str, @@ -127,15 +119,15 @@ def prepare_predictors_task( @click.command() # type: ignore[arg-type] -@with_output_directory(DEFAULT_ROOT) -@with_queue() +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_queue() def prepare_predictors(output_dir: str, queue: str) -> None: jobmon.run_parallel( "model prepare_predictors", node_args={ "output-dir": [output_dir], - "lat-start": LATITUDES, - "lon-start": LONGITUDES, + "lat-start": clio.LATITUDES, + "lon-start": clio.LONGITUDES, }, task_resources={ "queue": queue, diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py index d891957..c11bc60 100644 --- a/src/climate_downscale/model/prepare_training_data.py +++ b/src/climate_downscale/model/prepare_training_data.py @@ -1,102 +1,127 @@ from pathlib import Path +import click import numpy as np import numpy.typing as npt import pandas as pd -import rasterra as rt import xarray as xr +from rra_tools import jobmon + +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData + + +def load_and_clean_climate_stations( + cd_data: ClimateDownscaleData, + year: int | str, +) -> pd.DataFrame: + climate_stations = cd_data.load_ncei_climate_stations(year) + column_map = { + "DATE": "date", + "LATITUDE": "lat", + "LONGITUDE": "lon", + "TEMP": "temperature", + "ELEVATION": "ncei_elevation", + } + climate_stations = ( + climate_stations.rename(columns=column_map) + .loc[:, list(column_map.values())] + .dropna() + .reset_index(drop=True) + .assign( + date=lambda df: pd.to_datetime(df["date"]), + year=lambda df: df["date"].dt.year, + dayofyear=lambda df: df["date"].dt.dayofyear, + temperature=lambda df: 5 / 9 * (df["temperature"] - 32), + ) + ) + return climate_stations # noqa: RET504 def get_era5_temperature( - year: int | str, cs_df: pd.DataFrame + cd_data: ClimateDownscaleData, + year: int | str, + coords: dict[str, npt.NDArray[np.float64]], ) -> npt.NDArray[np.float64]: - lat = xr.DataArray(cs_df["lat"].values, dims=["points"]) - lon = xr.DataArray(cs_df["lon"].values, dims=["points"]) - time = xr.DataArray(cs_df["date"].values, dims=["points"]) + lat = xr.DataArray(coords["lat"], dims=["points"]) + lon = xr.DataArray(coords["lon"], dims=["points"]) + time = xr.DataArray(coords["date"], dims=["points"]) - era5 = xr.load_dataset( - f"/mnt/share/erf/climate_downscale/extracted_data/era5_temperature_daily_mean/{year}_era5_temp_daily.nc" + era5 = cd_data.load_era5_temperature_daily_mean(year) + era5 = ( + era5.assign_coords(longitude=(((era5.longitude + 180) % 360) - 180)) + .sortby(["latitude", "longitude"]) + .sel(latitude=lat, longitude=lon, time=time, method="nearest") ) - era5 = era5.assign_coords(longitude=(((era5.longitude + 180) % 360) - 180)).sortby( - ["latitude", "longitude"] - ) - arr = era5.sel(latitude=lat, longitude=lon, time=time, method="nearest") if "expver" in 
era5.coords: - arr = arr.sel(expver=1).combine_first(arr.sel(expver=5)) - return arr["t2m"].to_numpy() - 273.15 - - -year = 2023 - -# Load and cleanup -climate_stations = pd.read_parquet( - f"/mnt/share/erf/climate_downscale/extracted_data/ncei_climate_stations/{year}.parquet" -) -column_map = { - "DATE": "date", - "LATITUDE": "lat", - "LONGITUDE": "lon", - "TEMP": "temperature", - "ELEVATION": "ncei_elevation", -} -climate_stations = ( - climate_stations.rename(columns=column_map) - .loc[:, list(column_map.values())] - .dropna() - .reset_index(drop=True) -) - -# Do time things -climate_stations["date"] = pd.to_datetime(climate_stations["date"]) -climate_stations["year"] = climate_stations["date"].dt.year -climate_stations["dayofyear"] = climate_stations["date"].dt.dayofyear - -# Add temperature -climate_stations["temperature"] = 5 / 9 * (climate_stations["temperature"] - 32) -climate_stations["era5_temperature"] = get_era5_temperature(year, climate_stations) - -lon, lat = climate_stations["lon"].to_numpy(), climate_stations["lat"].to_numpy() - -# Elevation pieces -target_elevation = rt.load_mf_raster( - list( - Path("/mnt/share/erf/climate_downscale/model/predictors").glob( - "elevation_target_*.tif" - ) + era5 = era5.sel(expver=1).combine_first(era5.sel(expver=5)) + return era5["t2m"].to_numpy() - 273.15 + + +def prepare_training_data_main(output_dir: str | Path, year: str) -> None: + cd_data = ClimateDownscaleData(output_dir) + + data = load_and_clean_climate_stations(cd_data, year) + coords = { + "lon": data["lon"].to_numpy(), + "lat": data["lat"].to_numpy(), + "date": data["date"].to_numpy(), + } + + data["era5_temperature"] = get_era5_temperature(cd_data, year, coords) + + # Elevation pieces + data["target_elevation"] = cd_data.load_predictor("elevation_target").select( + coords["lon"], coords["lat"] ) -) -climate_stations["target_elevation"] = target_elevation.select(lon, lat) -era5_elevation = rt.load_mf_raster( - list( - Path("/mnt/share/erf/climate_downscale/model/predictors").glob( - "elevation_era5_*.tif" - ) + data["era5_elevation"] = cd_data.load_predictor("elevation_era5").select( + coords["lon"], coords["lat"] ) -) -climate_stations["era5_elevation"] = era5_elevation.select(lon, lat) - -climate_stations["elevation"] = climate_stations["ncei_elevation"] -nodata_val = -999 -missing_elevation = climate_stations["elevation"] < nodata_val -climate_stations.loc[missing_elevation, "elevation"] = climate_stations.loc[ - missing_elevation, "target_elevation" -] -still_missing_elevation = climate_stations["elevation"] < nodata_val -climate_stations = climate_stations.loc[~still_missing_elevation] - -# Local climate zone -target_lcz = rt.load_mf_raster( - list( - Path("/mnt/share/erf/climate_downscale/model/predictors").glob( - "lcz_target_*.tif" - ) + + data["elevation"] = data["ncei_elevation"] + nodata_val = -999 + missing_elevation = data["elevation"] < nodata_val + data.loc[missing_elevation, "elevation"] = data.loc[ + missing_elevation, "target_elevation" + ] + still_missing_elevation = data["elevation"] < nodata_val + data = data.loc[~still_missing_elevation] + + # Local climate zone + data["target_lcz"] = cd_data.load_predictor("lcz_target").select( + coords["lon"], coords["lat"] + ) + data["era5_lcz"] = cd_data.load_predictor("lcz_era5").select( + coords["lon"], coords["lat"] ) -) -climate_stations["target_lcz"] = target_lcz.select(lon, lat) -era5_lcz = rt.load_mf_raster( - list( - Path("/mnt/share/erf/climate_downscale/model/predictors").glob("lcz_era5_*.tif") + + 
cd_data.save_training_data(data, year) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_year() +def prepare_training_data_task(output_dir: str, year: str) -> None: + prepare_training_data_main(output_dir, year) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_queue() +def prepare_training_data(output_dir: str, queue: str) -> None: + jobmon.run_parallel( + "prepare training data", + node_args={ + "output-dir": [output_dir], + "year": clio.VALID_YEARS, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "10G", + "runtime": "240m", + "project": "proj_rapidresponse", + }, + runner="cdtask", ) -) -climate_stations["era5_lcz"] = era5_lcz.select(lon, lat) From 5846c7f8a58fb02dcc7b40f146342559a1252a56 Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 16 May 2024 13:47:15 -0700 Subject: [PATCH 10/71] use lcz v3 --- src/climate_downscale/model/prepare_predictors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py index a05a816..f710ace 100644 --- a/src/climate_downscale/model/prepare_predictors.py +++ b/src/climate_downscale/model/prepare_predictors.py @@ -58,7 +58,7 @@ def load_elevation( def load_lcz_data( cd_data: ClimateDownscaleData, latitudes: Sequence[int], longitudes: Sequence[int] ) -> rt.RasterArray: - path = cd_data.rub_local_climate_zones / "lcz_filter_v2.tif" + path = cd_data.rub_local_climate_zones / "lcz_filter_v3.tif" bounds = (longitudes[0], latitudes[0], longitudes[-1], latitudes[-1]) return rt.load_raster(path, bounds=bounds) From 4b43fb79e4bd90cc3a0932912d3b8e7f3ac08d37 Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 16 May 2024 13:49:53 -0700 Subject: [PATCH 11/71] Add prep training data --- src/climate_downscale/model/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/climate_downscale/model/__init__.py b/src/climate_downscale/model/__init__.py index 81a96c6..5449577 100644 --- a/src/climate_downscale/model/__init__.py +++ b/src/climate_downscale/model/__init__.py @@ -2,11 +2,17 @@ prepare_predictors, prepare_predictors_task, ) +from climate_downscale.model.prepare_training_data import ( + prepare_training_data, + prepare_training_data_task, +) RUNNERS = { "prepare_predictors": prepare_predictors, + "prepare_training_data": prepare_training_data, } TASK_RUNNERS = { "prepare_predictors": prepare_predictors_task, + "prepare_training_data": prepare_training_data_task, } From 79849a0e96534a86719283f4d9c2f88c1a93cf10 Mon Sep 17 00:00:00 2001 From: James Collins Date: Thu, 16 May 2024 15:16:01 -0700 Subject: [PATCH 12/71] Bugfixes and get prep training data running --- src/climate_downscale/model/prepare_training_data.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py index c11bc60..5c21f4b 100644 --- a/src/climate_downscale/model/prepare_training_data.py +++ b/src/climate_downscale/model/prepare_training_data.py @@ -85,8 +85,6 @@ def prepare_training_data_main(output_dir: str | Path, year: str) -> None: data.loc[missing_elevation, "elevation"] = data.loc[ missing_elevation, "target_elevation" ] - still_missing_elevation = data["elevation"] < nodata_val - data = data.loc[~still_missing_elevation] # Local climate zone data["target_lcz"] = cd_data.load_predictor("lcz_target").select( @@ -111,7 
+109,7 @@ def prepare_training_data_task(output_dir: str, year: str) -> None: @clio.with_queue() def prepare_training_data(output_dir: str, queue: str) -> None: jobmon.run_parallel( - "prepare training data", + "model prepare_training_data", node_args={ "output-dir": [output_dir], "year": clio.VALID_YEARS, @@ -119,8 +117,8 @@ def prepare_training_data(output_dir: str, queue: str) -> None: task_resources={ "queue": queue, "cores": 1, - "memory": "10G", - "runtime": "240m", + "memory": "30G", + "runtime": "30m", "project": "proj_rapidresponse", }, runner="cdtask", From 1ec16f76b8d9d020f8239e283cd3d4f8c2fcaa25 Mon Sep 17 00:00:00 2001 From: James Collins Date: Fri, 24 May 2024 15:08:26 -0700 Subject: [PATCH 13/71] Add station id and remove experimental data --- src/climate_downscale/model/prepare_training_data.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py index 5c21f4b..5122eff 100644 --- a/src/climate_downscale/model/prepare_training_data.py +++ b/src/climate_downscale/model/prepare_training_data.py @@ -22,6 +22,7 @@ def load_and_clean_climate_stations( "LONGITUDE": "lon", "TEMP": "temperature", "ELEVATION": "ncei_elevation", + "STATION": "station_id", } climate_stations = ( climate_stations.rename(columns=column_map) @@ -55,7 +56,9 @@ def get_era5_temperature( ) if "expver" in era5.coords: - era5 = era5.sel(expver=1).combine_first(era5.sel(expver=5)) + # expver == 1 is final data. expver == 5 is provisional data + # and has a very strong nonsense seasonal trend. + era5 = era5.sel(expver=1) return era5["t2m"].to_numpy() - 273.15 @@ -93,6 +96,7 @@ def prepare_training_data_main(output_dir: str | Path, year: str) -> None: data["era5_lcz"] = cd_data.load_predictor("lcz_era5").select( coords["lon"], coords["lat"] ) + cd_data.save_training_data(data, year) From 454ce1a5635cb3509b8ddf31a363e4d99409773c Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 24 May 2024 15:11:44 -0700 Subject: [PATCH 14/71] Update rra tools and jobmon usage --- poetry.lock | 18 +++++++++--------- pyproject.toml | 2 +- src/climate_downscale/extract/elevation.py | 6 ++++-- src/climate_downscale/extract/era5.py | 6 ++++-- .../extract/ncei_climate_stations.py | 8 +++++--- .../model/prepare_predictors.py | 8 +++++--- .../model/prepare_training_data.py | 8 +++++--- 7 files changed, 33 insertions(+), 23 deletions(-) diff --git a/poetry.lock b/poetry.lock index 310bfd0..b442f1f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1969,7 +1969,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash =
"sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2178,13 +2177,13 @@ files = [ [[package]] name = "requests" -version = "2.31.0" +version = "2.32.2" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.2-py3-none-any.whl", hash = "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c"}, + {file = "requests-2.32.2.tar.gz", hash = "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289"}, ] [package.dependencies] @@ -2199,13 +2198,13 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rra-tools" -version = "1.0.6" +version = "1.0.8" description = "Common utilities for IHME Rapid Response team pipelines." optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "rra_tools-1.0.6-py3-none-any.whl", hash = "sha256:51d2f56e0b6fc13a7198e517aa61aed5fcd85c625bfb0e953a9e81ef12a4f7d9"}, - {file = "rra_tools-1.0.6.tar.gz", hash = "sha256:9abc933b7c0efc2a899d56fd71e9c89a21c82950788c0bbc559200615a17880d"}, + {file = "rra_tools-1.0.8-py3-none-any.whl", hash = "sha256:d499aa58403c2b26486a3f9a892239945aee4321067b64aac027ad5e86f39a48"}, + {file = "rra_tools-1.0.8.tar.gz", hash = "sha256:9d4bf15c4ce60a3af6b55e4e6d158446c91e880a9dd89f6fdd0c72a2d633f888"}, ] [package.dependencies] @@ -2214,6 +2213,7 @@ deep-translator = ">=1.11.4,<2.0.0" loguru = ">=0.7.2,<0.8.0" pandas = ">=2.2.2,<3.0.0" pathos = ">=0.3.2,<0.4.0" +requests = ">=2.32.2,<3.0.0" tqdm = ">=4.66.4,<5.0.0" [[package]] @@ -2683,4 +2683,4 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "76b81344dbd944abdc6006c3ce2e8a8ce7ae3131f747247396f1fb01ab80e1a3" +content-hash = "f26dfb9999164fb0037e8fd2b96ea27324abfb50902d9f5e2567717902199f23" diff --git a/pyproject.toml b/pyproject.toml index 79a58f4..73fd3c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ xarray = "^2024.3.0" cdsapi = "^0.7.0" matplotlib = "^3.8.4" scikit-learn = "^1.4.2" -rra-tools = "^1.0.6" +rra-tools = "^1.0.8" netcdf4 = "^1.6.5" pyarrow = "^16.0.0" types-requests = "^2.31.0.20240406" diff --git a/src/climate_downscale/extract/elevation.py b/src/climate_downscale/extract/elevation.py index 5549886..c12b294 100644 --- a/src/climate_downscale/extract/elevation.py +++ b/src/climate_downscale/extract/elevation.py @@ -115,13 +115,16 @@ def extract_elevation( lon_starts = list(range(-180, 180, FETCH_SIZE)) jobmon.run_parallel( + runner="cdtask", task_name="extract_era5", node_args={ - "output-dir": [output_dir], "model-name": [model_name], "lat-start": lat_starts, "lon-start": lon_starts, }, + task_args={ + "output-dir": output_dir, + }, task_resources={ "queue": queue, "cores": 1, @@ -129,5 +132,4 @@ def extract_elevation( "runtime": "240m", "project": "proj_rapidresponse", }, - runner="cdtask", ) diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index e27ad8e..38c9ea9 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -72,12 +72,15 @@ def extract_era5( variables = clio.VALID_CLIMATE_VARIABLES if variable == clio.RUN_ALL else [variable] jobmon.run_parallel( + runner="cdtask", 
task_name="extract_era5", node_args={ - "output-dir": [output_dir], "year": years, "variable": variables, }, + task_args={ + "output-dir": output_dir, + }, task_resources={ "queue": queue, "cores": 1, @@ -85,5 +88,4 @@ def extract_era5( "runtime": "240m", "project": "proj_rapidresponse", }, - runner="cdtask", ) diff --git a/src/climate_downscale/extract/ncei_climate_stations.py b/src/climate_downscale/extract/ncei_climate_stations.py index 048f6a8..c8d770e 100644 --- a/src/climate_downscale/extract/ncei_climate_stations.py +++ b/src/climate_downscale/extract/ncei_climate_stations.py @@ -54,11 +54,14 @@ def extract_ncei_climate_stations_task(output_dir: str, year: str) -> None: @with_queue() def extract_ncei_climate_stations(output_dir: str, queue: str) -> None: jobmon.run_parallel( - "extract ncei", + runner="cdtask", + task_name="extract ncei", node_args={ - "output-dir": [output_dir], "year": EXTRACTION_YEARS, }, + task_args={ + "output-dir": output_dir, + }, task_resources={ "queue": queue, "cores": 1, @@ -66,5 +69,4 @@ def extract_ncei_climate_stations(output_dir: str, queue: str) -> None: "runtime": "240m", "project": "proj_rapidresponse", }, - runner="cdtask", ) diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/model/prepare_predictors.py index f710ace..c858c90 100644 --- a/src/climate_downscale/model/prepare_predictors.py +++ b/src/climate_downscale/model/prepare_predictors.py @@ -123,12 +123,15 @@ def prepare_predictors_task( @clio.with_queue() def prepare_predictors(output_dir: str, queue: str) -> None: jobmon.run_parallel( - "model prepare_predictors", + runner="cdtask", + task_name="model prepare_predictors", node_args={ - "output-dir": [output_dir], "lat-start": clio.LATITUDES, "lon-start": clio.LONGITUDES, }, + task_args={ + "output-dir": output_dir, + }, task_resources={ "queue": queue, "cores": 1, @@ -136,5 +139,4 @@ def prepare_predictors(output_dir: str, queue: str) -> None: "runtime": "45m", "project": "proj_rapidresponse", }, - runner="cdtask", ) diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/model/prepare_training_data.py index c11bc60..081540f 100644 --- a/src/climate_downscale/model/prepare_training_data.py +++ b/src/climate_downscale/model/prepare_training_data.py @@ -111,11 +111,14 @@ def prepare_training_data_task(output_dir: str, year: str) -> None: @clio.with_queue() def prepare_training_data(output_dir: str, queue: str) -> None: jobmon.run_parallel( - "prepare training data", + runner="cdtask", + task_name="prepare training data", node_args={ - "output-dir": [output_dir], "year": clio.VALID_YEARS, }, + task_args={ + "output-dir": output_dir, + }, task_resources={ "queue": queue, "cores": 1, @@ -123,5 +126,4 @@ def prepare_training_data(output_dir: str, queue: str) -> None: "runtime": "240m", "project": "proj_rapidresponse", }, - runner="cdtask", ) From bad9727d33b2b913f5cffb04bc50bbaef96c58eb Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 24 May 2024 16:22:13 -0700 Subject: [PATCH 15/71] REvamp era5 download script --- src/climate_downscale/cli_options.py | 20 ++++++++ src/climate_downscale/data.py | 10 ++++ src/climate_downscale/extract/era5.py | 68 ++++++++++++++------------- 3 files changed, 66 insertions(+), 32 deletions(-) diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index a1eeeec..aa5f054 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -52,6 +52,8 @@ def with_month( 
VALID_CLIMATE_VARIABLES = [ "total_precipitation", "2m_temperature", + "2m_dewpoint_temperature", + "surface_pressure", ] @@ -68,6 +70,22 @@ def with_climate_variable( ) +VALID_ERA5_DATASETS = ["reanalysis-era5-land", "reanalysis-era5-single-levels"] + + +def with_era5_dataset( + *, + allow_all: bool = False, +) -> ClickOption[_P, _T]: + return with_choice( + "era5-dataset", + "d", + allow_all=allow_all, + choices=VALID_ERA5_DATASETS, + help="Dataset to extract.", + ) + + STRIDE = 30 LATITUDES = [str(lat) for lat in range(-90, 90, STRIDE)] LONGITUDES = [str(lon) for lon in range(-180, 180, STRIDE)] @@ -101,12 +119,14 @@ def with_lon_start( "VALID_YEARS", "VALID_MONTHS", "VALID_CLIMATE_VARIABLES", + "VALID_DATASETS", "STRIDE", "LATITUDES", "LONGITUDES", "with_year", "with_month", "with_climate_variable", + "with_dataset", "with_lat_start", "with_lon_start", "with_output_directory", diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 6ca01d7..7535bac 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -26,6 +26,16 @@ def credentials_root(self) -> Path: def extracted_data(self) -> Path: return self.root / "extracted_data" + @property + def era5(self) -> Path: + return self.extracted_data / "era5" + + def era5_path(self, dataset: str, variable: str, year: int | str) -> Path: + return self.era5 / f"{dataset}_{variable}_{year}.nc" + + def load_era5(self, dataset: str, variable: str, year: int | str) -> xr.Dataset: + return xr.open_dataset(self.era5_path(dataset, variable, year)) + @property def era5_temperature_daily_mean(self) -> Path: return self.extracted_data / "era5_temperature_daily_mean" diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 38c9ea9..69645c5 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -11,9 +11,9 @@ def extract_era5_main( output_dir: str | Path, + era5_dataset: str, + climate_variable: str, year: int | str, - month: str, - variable: str, ) -> None: cddata = ClimateDownscaleData(output_dir) cred_path = cddata.credentials_root / "copernicus.txt" @@ -21,62 +21,66 @@ def extract_era5_main( copernicus = cdsapi.Client(url=url, key=key) kwargs = { - "dataset": "reanalysis-era5-land", "product_type": "reanalysis", - "statistic": "daily_mean", - "variable": "total_precipitation", - "year": "2020", - "month": "01", - "time_zone": "UTC+00:00", - "frequency": "1-hourly", - "grid": "0.1/0.1", - "area": {"lat": [-90, 90], "lon": [-180, 180]}, + "variable": climate_variable, + "year": year, + "month": clio.VALID_MONTHS, + "time": [f"{h:02d}:00" for h in range(0, 24)], + "format": "netcdf", } - result = copernicus.service( - "tool.toolbox.orchestrator.workflow", - params={ - "realm": "user-apps", - "project": "app-c3s-daily-era5-statistics", - "version": "master", - "kwargs": kwargs, - "workflow_name": "application", - }, - ) - - out_path = cddata.era5_temperature_daily_mean / f"{variable}_{year}_{month}.nc" + out_path = cddata.era5_path(era5_dataset, climate_variable, year) touch(out_path, exist_ok=True) - copernicus.download(result, [out_path]) + + copernicus.retrieve( + era5_dataset, + kwargs, + out_path, + ) @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year() -@clio.with_month() +@clio.with_era5_dataset() @clio.with_climate_variable() -def extract_era5_task(year: str, month: str, climate_variable: str) -> None: - extract_era5_main(DEFAULT_ROOT, year, month, climate_variable) 
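For reference, the request assembled by extract_era5_main above expands to roughly this one-shot cdsapi call (a sketch: the dataset, variable, and year are illustrative, and credentials normally come from the copernicus.txt file read above or from ~/.cdsapirc):

import cdsapi

client = cdsapi.Client()
client.retrieve(
    "reanalysis-era5-single-levels",
    {
        "product_type": "reanalysis",
        "variable": "2m_temperature",
        "year": "2020",
        "month": [f"{m:02d}" for m in range(1, 13)],  # clio.VALID_MONTHS
        "time": [f"{h:02d}:00" for h in range(24)],   # every hour of the day
        "format": "netcdf",
    },
    "reanalysis-era5-single-levels_2m_temperature_2020.nc",  # mirrors era5_path naming
)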
+@clio.with_year() +def extract_era5_task( + output_dir: str, + era5_dataset: str, + climate_variable: str, + year: str, +) -> None: + extract_era5_main( + output_dir, + era5_dataset, + climate_variable, + year, + ) @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year(allow_all=True) +@clio.with_era5_dataset(allow_all=True) @clio.with_climate_variable(allow_all=True) +@clio.with_year(allow_all=True) @clio.with_queue() def extract_era5( output_dir: str, + era5_dataset: str, + climate_variable: str, year: str, - variable: str, queue: str, ) -> None: + datasets = clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] + variables = clio.VALID_CLIMATE_VARIABLES if climate_variable == clio.RUN_ALL else [climate_variable] years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] - variables = clio.VALID_CLIMATE_VARIABLES if variable == clio.RUN_ALL else [variable] jobmon.run_parallel( runner="cdtask", task_name="extract_era5", node_args={ + "era5-dataset": datasets, + "climate-variable": variables, "year": years, - "variable": variables, }, task_args={ "output-dir": output_dir, From e8f41bec169e1b2651cc9b3f699b60f6364d99e8 Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 24 May 2024 16:27:32 -0700 Subject: [PATCH 16/71] Add month --- src/climate_downscale/data.py | 14 +++++++++----- src/climate_downscale/extract/era5.py | 26 ++++++++++++++++++++------ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 7535bac..48ba036 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -30,11 +30,15 @@ def extracted_data(self) -> Path: def era5(self) -> Path: return self.extracted_data / "era5" - def era5_path(self, dataset: str, variable: str, year: int | str) -> Path: - return self.era5 / f"{dataset}_{variable}_{year}.nc" - - def load_era5(self, dataset: str, variable: str, year: int | str) -> xr.Dataset: - return xr.open_dataset(self.era5_path(dataset, variable, year)) + def era5_path( + self, dataset: str, variable: str, year: int | str, month: str + ) -> Path: + return self.era5 / f"{dataset}_{variable}_{year}_{month}.nc" + + def load_era5( + self, dataset: str, variable: str, year: int | str, month: str + ) -> xr.Dataset: + return xr.open_dataset(self.era5_path(dataset, variable, year, month)) @property def era5_temperature_daily_mean(self) -> Path: diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 69645c5..3c73182 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -14,6 +14,7 @@ def extract_era5_main( era5_dataset: str, climate_variable: str, year: int | str, + month: str, ) -> None: cddata = ClimateDownscaleData(output_dir) cred_path = cddata.credentials_root / "copernicus.txt" @@ -24,11 +25,11 @@ def extract_era5_main( "product_type": "reanalysis", "variable": climate_variable, "year": year, - "month": clio.VALID_MONTHS, - "time": [f"{h:02d}:00" for h in range(0, 24)], + "month": month, + "time": [f"{h:02d}:00" for h in range(24)], "format": "netcdf", } - out_path = cddata.era5_path(era5_dataset, climate_variable, year) + out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) touch(out_path, exist_ok=True) copernicus.retrieve( @@ -43,17 +44,20 @@ def extract_era5_main( @clio.with_era5_dataset() @clio.with_climate_variable() @clio.with_year() +@clio.with_month() def extract_era5_task( output_dir: str, 
era5_dataset: str, climate_variable: str, year: str, + month: str, ) -> None: extract_era5_main( output_dir, era5_dataset, climate_variable, year, + month, ) @@ -62,17 +66,26 @@ def extract_era5_task( @clio.with_era5_dataset(allow_all=True) @clio.with_climate_variable(allow_all=True) @clio.with_year(allow_all=True) +@clio.with_month(allow_all=True) @clio.with_queue() -def extract_era5( +def extract_era5( # noqa: PLR0913 output_dir: str, era5_dataset: str, climate_variable: str, year: str, + month: str, queue: str, ) -> None: - datasets = clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] - variables = clio.VALID_CLIMATE_VARIABLES if climate_variable == clio.RUN_ALL else [climate_variable] + datasets = ( + clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] + ) + variables = ( + clio.VALID_CLIMATE_VARIABLES + if climate_variable == clio.RUN_ALL + else [climate_variable] + ) years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] + months = clio.VALID_MONTHS if month == clio.RUN_ALL else [month] jobmon.run_parallel( runner="cdtask", @@ -81,6 +94,7 @@ def extract_era5( "era5-dataset": datasets, "climate-variable": variables, "year": years, + "month": months, }, task_args={ "output-dir": output_dir, From 407bdc51ffbdce6d8e9cf95e0b6d3602dc6ec349 Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 24 May 2024 16:36:12 -0700 Subject: [PATCH 17/71] Need to request day --- src/climate_downscale/extract/era5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 879a13b..a0055b7 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -26,6 +26,7 @@ def extract_era5_main( "variable": climate_variable, "year": year, "month": month, + "day": [f"{d:02d}" for d in range(1, 32)], "time": [f"{h:02d}:00" for h in range(24)], "format": "netcdf", } From ae327778ec492470fb0c5aff3eee4288a1718f7d Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 24 May 2024 16:45:43 -0700 Subject: [PATCH 18/71] Add caching --- src/climate_downscale/extract/era5.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index a0055b7..7c2020d 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -31,13 +31,21 @@ def extract_era5_main( "format": "netcdf", } out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) - touch(out_path, exist_ok=True) + if out_path.exists(): + print("Already extracted:", out_path) + return - copernicus.retrieve( - era5_dataset, - kwargs, - out_path, - ) + touch(out_path) + try: + result = copernicus.retrieve( + era5_dataset, + kwargs, + ) + result.download(out_path) + except Exception as e: + print(f"Failed to download {era5_dataset} {climate_variable} {year} {month}") + out_path.unlink() + raise e # noqa: TRY201 @click.command() # type: ignore[arg-type] From e95d05088f341d42fa64b6deaf457790c9476c85 Mon Sep 17 00:00:00 2001 From: James Collins Date: Mon, 27 May 2024 15:08:33 -0700 Subject: [PATCH 19/71] compress files --- src/climate_downscale/extract/era5.py | 82 ++++++++++++++++++++------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 7c2020d..364ea8d 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -4,6 +4,7 @@ 
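The caching added in the previous patch is an idempotency idiom the download/compress split below keeps relying on: skip work whose output already exists, and never leave a partial file behind when a step fails. A generic sketch of the idiom (an illustrative helper, not code from this series):

from pathlib import Path
from typing import Callable

def fetch_if_needed(out_path: Path, fetch: Callable[[Path], None]) -> None:
    # Finished work makes the task a no-op, so it is safe to re-run.
    if out_path.exists():
        print("Already extracted:", out_path)
        return
    try:
        fetch(out_path)
    except Exception:
        # Drop the partial file so the next attempt starts clean.
        out_path.unlink(missing_ok=True)
        raise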
import click from rra_tools import jobmon from rra_tools.shell_tools import touch +import xarray as xr from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData @@ -19,34 +20,73 @@ def extract_era5_main( cddata = ClimateDownscaleData(output_dir) cred_path = cddata.credentials_root / "copernicus.txt" url, key = cred_path.read_text().strip().split("\n") - - copernicus = cdsapi.Client(url=url, key=key) - kwargs = { - "product_type": "reanalysis", - "variable": climate_variable, - "year": year, - "month": month, - "day": [f"{d:02d}" for d in range(1, 32)], - "time": [f"{h:02d}:00" for h in range(24)], - "format": "netcdf", - } + out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + raw_out_path = out_path.with_stem(f"{out_path.stem}_raw") + if out_path.exists(): - print("Already extracted:", out_path) - return + if raw_out_path.exists(): + # We ran into an error before completing compression, likely a + # memory error. Delete and retry. + out_path.unlink() + else: + print("Already extracted:", out_path) + return + + try: + if not raw_out_path.exists(): + return + touch(raw_out_path) + + print('Connecting to copernicus') + copernicus = cdsapi.Client(url=url, key=key) + kwargs = { + "product_type": "reanalysis", + "variable": climate_variable, + "year": year, + "month": month, + "day": [f"{d:02d}" for d in range(1, 32)], + "time": [f"{h:02d}:00" for h in range(24)], + "format": "netcdf", + } + print("Downloading...") + result = copernicus.retrieve( + era5_dataset, + kwargs, + ) + result.download(raw_out_path) + else: + print("Already downloaded:", raw_out_path) + except Exception as e: + print(f"Failed to download {era5_dataset} {climate_variable} {year} {month}") + if raw_out_path.exists(): + raw_out_path.unlink() + raise e # noqa: TRY201 touch(out_path) try: - result = copernicus.retrieve( - era5_dataset, - kwargs, + print("Compressing...") + ds = xr.open_dataset(raw_out_path) + var_name = list(ds)[0] # These are all single variable datasets + og_encoding = ds[var_name].encoding + ds.to_netcdf( + out_path, + encoding={ + var_name:{ + **og_encoding, + "zlib": True, + "complevel": 1, + } + } ) - result.download(out_path) + except Exception as e: - print(f"Failed to download {era5_dataset} {climate_variable} {year} {month}") - out_path.unlink() + print(f'Failed to compress {era5_dataset} {climate_variable} {year} {month}') + if out_path.exists(): + out_path.unlink() raise e # noqa: TRY201 + raw_out_path.unlink() @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @@ -111,8 +151,8 @@ def extract_era5( # noqa: PLR0913 task_resources={ "queue": queue, "cores": 1, - "memory": "10G", - "runtime": "240m", + "memory": "120G", + "runtime": "600m", "project": "proj_rapidresponse", }, ) From a07cbfcc41fd086d8e1d9ea5eda1311c5194adf4 Mon Sep 17 00:00:00 2001 From: collijk Date: Mon, 27 May 2024 17:32:49 -0700 Subject: [PATCH 20/71] Add infrastructure to download different filetypes and do compression on results --- poetry.lock | 8 +- pyproject.toml | 2 +- src/climate_downscale/cli_options.py | 4 +- src/climate_downscale/extract/__init__.py | 6 +- src/climate_downscale/extract/era5.py | 236 +++++++++++++++------- 5 files changed, 173 insertions(+), 83 deletions(-) diff --git a/poetry.lock b/poetry.lock index b442f1f..1bd9e79 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2198,13 +2198,13 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rra-tools" -version = "1.0.8" +version = 
"1.0.9" description = "Common utilities for IHME Rapid Response team pipelines." optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "rra_tools-1.0.8-py3-none-any.whl", hash = "sha256:d499aa58403c2b26486a3f9a892239945aee4321067b64aac027ad5e86f39a48"}, - {file = "rra_tools-1.0.8.tar.gz", hash = "sha256:9d4bf15c4ce60a3af6b55e4e6d158446c91e880a9dd89f6fdd0c72a2d633f888"}, + {file = "rra_tools-1.0.9-py3-none-any.whl", hash = "sha256:9deb367bfb13a627df36263f6771b9c10ff8cd0458915750201b046d4343c7bd"}, + {file = "rra_tools-1.0.9.tar.gz", hash = "sha256:fe5040ade3a49498f124ec557778743f47c3e95d83de48aedd197c71abd29e78"}, ] [package.dependencies] @@ -2683,4 +2683,4 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "f26dfb9999164fb0037e8fd2b96ea27324abfb50902d9f5e2567717902199f23" +content-hash = "eac4d5a666c56578b00e14d3cf04ffe5ce70619ef1e1b25c374781f4e4d08e61" diff --git a/pyproject.toml b/pyproject.toml index 73fd3c8..6b2cb8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ xarray = "^2024.3.0" cdsapi = "^0.7.0" matplotlib = "^3.8.4" scikit-learn = "^1.4.2" -rra-tools = "^1.0.8" +rra-tools = "^1.0.9" netcdf4 = "^1.6.5" pyarrow = "^16.0.0" types-requests = "^2.31.0.20240406" diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index aa5f054..fb75ae2 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -119,14 +119,14 @@ def with_lon_start( "VALID_YEARS", "VALID_MONTHS", "VALID_CLIMATE_VARIABLES", - "VALID_DATASETS", + "VALID_ERA5_DATASETS", "STRIDE", "LATITUDES", "LONGITUDES", "with_year", "with_month", "with_climate_variable", - "with_dataset", + "with_era5_dataset", "with_lat_start", "with_lon_start", "with_output_directory", diff --git a/src/climate_downscale/extract/__init__.py b/src/climate_downscale/extract/__init__.py index 7651931..364bcf6 100644 --- a/src/climate_downscale/extract/__init__.py +++ b/src/climate_downscale/extract/__init__.py @@ -3,8 +3,9 @@ extract_elevation_task, ) from climate_downscale.extract.era5 import ( + download_era5_task, extract_era5, - extract_era5_task, + unzip_and_compress_era5_task, ) from climate_downscale.extract.ncei_climate_stations import ( extract_ncei_climate_stations, @@ -23,7 +24,8 @@ TASK_RUNNERS = { "ncei": extract_ncei_climate_stations_task, - "era5": extract_era5_task, + "era5_download": download_era5_task, + "era5_compress": unzip_and_compress_era5_task, "lcz": extract_rub_local_climate_zones, "elevation": extract_elevation_task, } diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 364ea8d..12f0535 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -1,16 +1,30 @@ +import itertools +import zipfile from pathlib import Path import cdsapi import click +import xarray as xr from rra_tools import jobmon from rra_tools.shell_tools import touch -import xarray as xr from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData -def extract_era5_main( +def get_download_spec( + final_out_path: Path, +) -> tuple[Path, str]: + if "land" in final_out_path.stem: + download_path = final_out_path.with_suffix(".zip") + download_format = "netcdf.zip" + else: + download_path = final_out_path.with_stem(f"{final_out_path.stem}_raw") + download_format = "netcdf" + return download_path, download_format + + +def download_era5_main( 
output_dir: str | Path, era5_dataset: str, climate_variable: str, @@ -18,75 +32,84 @@ def extract_era5_main( month: str, ) -> None: cddata = ClimateDownscaleData(output_dir) - cred_path = cddata.credentials_root / "copernicus.txt" - url, key = cred_path.read_text().strip().split("\n") - - out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) - raw_out_path = out_path.with_stem(f"{out_path.stem}_raw") - - if out_path.exists(): - if raw_out_path.exists(): - # We ran into an error before completing compression, likely a - # memory error. Delete and retry. - out_path.unlink() - else: - print("Already extracted:", out_path) - return - + + final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + download_path, download_format = get_download_spec(final_out_path) + + if download_path.exists(): + print("Already downloaded:", download_path) + return + try: - if not raw_out_path.exists(): - return - touch(raw_out_path) - - print('Connecting to copernicus') - copernicus = cdsapi.Client(url=url, key=key) - kwargs = { - "product_type": "reanalysis", - "variable": climate_variable, - "year": year, - "month": month, - "day": [f"{d:02d}" for d in range(1, 32)], - "time": [f"{h:02d}:00" for h in range(24)], - "format": "netcdf", - } - print("Downloading...") - result = copernicus.retrieve( - era5_dataset, - kwargs, - ) - result.download(raw_out_path) - else: - print("Already downloaded:", raw_out_path) + touch(download_path) + + print("Connecting to copernicus") + + cred_path = cddata.credentials_root / "copernicus.txt" + url, key = cred_path.read_text().strip().split("\n") + copernicus = cdsapi.Client(url=url, key=key) + + print("Downloading...") + kwargs = { + "product_type": "reanalysis", + "variable": climate_variable, + "year": year, + "month": month, + "day": [f"{d:02d}" for d in range(1, 32)], + "time": [f"{h:02d}:00" for h in range(24)], + "format": download_format, + } + + result = copernicus.retrieve( + era5_dataset, + kwargs, + ) + result.download(download_path) except Exception as e: print(f"Failed to download {era5_dataset} {climate_variable} {year} {month}") - if raw_out_path.exists(): - raw_out_path.unlink() + if download_path.exists(): + download_path.unlink() raise e # noqa: TRY201 - touch(out_path) - try: - print("Compressing...") - ds = xr.open_dataset(raw_out_path) - var_name = list(ds)[0] # These are all single variable datasets - og_encoding = ds[var_name].encoding - ds.to_netcdf( - out_path, - encoding={ - var_name:{ - **og_encoding, - "zlib": True, - "complevel": 1, - } + +def unzip_and_compress_era5( + output_dir: str | Path, + era5_dataset: str, + climate_variable: str, + year: int | str, + month: str, +) -> None: + cddata = ClimateDownscaleData(output_dir) + final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + uncompressed_path = final_out_path.with_stem(f"{final_out_path.stem}_raw") + + if era5_dataset == "reanalysis-era5-land": + print("Unzipping...") + # This data needs to be unzipped first. 
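+        # (The "netcdf.zip" download format wraps exactly one .nc member;
+        # the guard below enforces that single-member assumption before
+        # extracting it.)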
+ zip_path = final_out_path.with_suffix(".zip") + touch(uncompressed_path) + with zipfile.ZipFile(zip_path) as zf: + zinfo = zf.infolist() + if len(zinfo) != 1: + msg = f"Expected a single file in {zip_path}" + raise ValueError(msg) + zf.extract(zinfo[0], uncompressed_path) + + touch(final_out_path) + ds = xr.open_dataset(final_out_path) + var_name = next(iter(ds)) # These are all single variable datasets + og_encoding = ds[var_name].encoding + ds.to_netcdf( + final_out_path, + encoding={ + var_name: { + **og_encoding, + "zlib": True, + "complevel": 1, } - ) - - except Exception as e: - print(f'Failed to compress {era5_dataset} {climate_variable} {year} {month}') - if out_path.exists(): - out_path.unlink() - raise e # noqa: TRY201 + }, + ) - raw_out_path.unlink() @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @@ -94,14 +117,36 @@ def extract_era5_main( @clio.with_climate_variable() @clio.with_year() @clio.with_month() -def extract_era5_task( +def download_era5_task( output_dir: str, era5_dataset: str, climate_variable: str, year: str, month: str, ) -> None: - extract_era5_main( + download_era5_main( + output_dir, + era5_dataset, + climate_variable, + year, + month, + ) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_era5_dataset() +@clio.with_climate_variable() +@clio.with_year() +@clio.with_month() +def unzip_and_compress_era5_task( + output_dir: str, + era5_dataset: str, + climate_variable: str, + year: str, + month: str, +) -> None: + unzip_and_compress_era5( output_dir, era5_dataset, climate_variable, @@ -125,6 +170,8 @@ def extract_era5( # noqa: PLR0913 month: str, queue: str, ) -> None: + cddata = ClimateDownscaleData(output_dir) + datasets = ( clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] ) @@ -136,23 +183,64 @@ def extract_era5( # noqa: PLR0913 years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] months = clio.VALID_MONTHS if month == clio.RUN_ALL else [month] + to_download = [] + to_compress = [] + for dataset, variable, year, month in itertools.product( + datasets, variables, years, months + ): + final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + download_path, _ = get_download_spec(final_out_path) + + if final_out_path.exists() and download_path.exists(): + # We broke in the middle of processing this file. Don't re-download, + # just reprocess. 
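+            # A half-written final file next to a surviving download means only
+            # the compression step failed, so the spec is queued for recompression
+            # without being re-downloaded.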
+ final_out_path.unlink() + to_compress.append((dataset, variable, year, month)) + elif final_out_path.exists(): + # We've already extracted this dataset + continue + + to_download.append((dataset, variable, year, month)) + to_compress.append((dataset, variable, year, month)) + jobmon.run_parallel( runner="cdtask", - task_name="extract era5", - node_args={ - "era5-dataset": datasets, - "climate-variable": variables, - "year": years, - "month": months, - }, + task_name="extract era5_download", + flat_node_args=( + ("era5-dataset", "climate-variable", "year", "month"), + to_compress, + ), task_args={ "output-dir": output_dir, }, task_resources={ "queue": queue, "cores": 1, - "memory": "120G", + "memory": "10G", "runtime": "600m", "project": "proj_rapidresponse", }, + max_attempts=1, + concurrency_limit=25, + ) + + jobmon.run_parallel( + runner="cdtask", + task_name="extract era5_compress", + flat_node_args=( + ("era5-dataset", "climate-variable", "year", "month"), + to_compress, + ), + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "125G", + "runtime": "30m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + concurrency_limit=500, ) From f8081d27da00fed6a901fe514bb82cd50d084332 Mon Sep 17 00:00:00 2001 From: James Collins Date: Sun, 9 Jun 2024 14:31:19 -0700 Subject: [PATCH 21/71] Expand variables and parallelize over users --- poetry.lock | 10 +- pyproject.toml | 2 +- src/climate_downscale/cli_options.py | 11 ++- src/climate_downscale/extract/era5.py | 129 +++++++++++++++++++------- 4 files changed, 109 insertions(+), 43 deletions(-) diff --git a/poetry.lock b/poetry.lock index 1bd9e79..0d9646d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "affine" @@ -2198,13 +2198,13 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rra-tools" -version = "1.0.9" +version = "1.0.10" description = "Common utilities for IHME Rapid Response team pipelines." 
optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "rra_tools-1.0.9-py3-none-any.whl", hash = "sha256:9deb367bfb13a627df36263f6771b9c10ff8cd0458915750201b046d4343c7bd"}, - {file = "rra_tools-1.0.9.tar.gz", hash = "sha256:fe5040ade3a49498f124ec557778743f47c3e95d83de48aedd197c71abd29e78"}, + {file = "rra_tools-1.0.10-py3-none-any.whl", hash = "sha256:04a16fb8ca1f60b25360a709367a34497ced5176c506668cfec4dbce7f1b75e7"}, + {file = "rra_tools-1.0.10.tar.gz", hash = "sha256:9a43e76061d8538c4545fe59a0d8ecc146eed9c5265c59c579bc7bcb00da5677"}, ] [package.dependencies] @@ -2683,4 +2683,4 @@ viz = ["matplotlib", "nc-time-axis", "seaborn"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "eac4d5a666c56578b00e14d3cf04ffe5ce70619ef1e1b25c374781f4e4d08e61" +content-hash = "da6f45d547ceb2940cf87d9792ce11d7115e9b11a405ab3420dce9850d2a092f" diff --git a/pyproject.toml b/pyproject.toml index 6b2cb8b..017e751 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ xarray = "^2024.3.0" cdsapi = "^0.7.0" matplotlib = "^3.8.4" scikit-learn = "^1.4.2" -rra-tools = "^1.0.9" +rra-tools = "^1.0.10" netcdf4 = "^1.6.5" pyarrow = "^16.0.0" types-requests = "^2.31.0.20240406" diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index fb75ae2..8bcacfd 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -50,10 +50,17 @@ def with_month( VALID_CLIMATE_VARIABLES = [ - "total_precipitation", - "2m_temperature", + "10m_u_component_of_wind", + "10m_v_component_of_wind", "2m_dewpoint_temperature", + "2m_temperature", + "surface_net_solar_radiation", + "surface_net_thermal_radiation", "surface_pressure", + "surface_solar_radiation_downwards", + "surface_thermal_radiation_downwards", + "total_precipitation", + "total_sky_direct_solar_radiation_at_surface", ] diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 12f0535..83c4108 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -11,6 +11,8 @@ from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData +import yaml + def get_download_spec( final_out_path: Path, @@ -30,6 +32,7 @@ def download_era5_main( climate_variable: str, year: int | str, month: str, + user: str, ) -> None: cddata = ClimateDownscaleData(output_dir) @@ -45,8 +48,10 @@ def download_era5_main( print("Connecting to copernicus") - cred_path = cddata.credentials_root / "copernicus.txt" - url, key = cred_path.read_text().strip().split("\n") + cred_path = cddata.credentials_root / "copernicus.yaml" + credentials = yaml.safe_load(cred_path.read_text()) + url = credentials['url'] + key = credentials['keys'][user] copernicus = cdsapi.Client(url=url, key=key) print("Downloading...") @@ -81,22 +86,35 @@ def unzip_and_compress_era5( ) -> None: cddata = ClimateDownscaleData(output_dir) final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + zip_path = final_out_path.with_suffix(".zip") uncompressed_path = final_out_path.with_stem(f"{final_out_path.stem}_raw") - + if era5_dataset == "reanalysis-era5-land": print("Unzipping...") - # This data needs to be unzipped first. - zip_path = final_out_path.with_suffix(".zip") + # This data needs to be unzipped first. 
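+            # Opening the archive is enough to validate it: ZipFile reads the
+            # central directory on construction and raises BadZipFile if the
+            # download was cut short.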
+ if uncompressed_path.exists(): + uncompressed_path.unlink() touch(uncompressed_path) + try: + with zipfile.ZipFile(zip_path) as zf: + pass + except zipfile.BadZipFile as e: + # Download failed or was interrupted, delete the zipfile + zip_path.unlink() + raise e + with zipfile.ZipFile(zip_path) as zf: zinfo = zf.infolist() if len(zinfo) != 1: msg = f"Expected a single file in {zip_path}" raise ValueError(msg) - zf.extract(zinfo[0], uncompressed_path) + with uncompressed_path.open('wb') as f: + f.write(zf.read(zinfo[0])) + + print("Compressing") touch(final_out_path) - ds = xr.open_dataset(final_out_path) + ds = xr.open_dataset(uncompressed_path) var_name = next(iter(ds)) # These are all single variable datasets og_encoding = ds[var_name].encoding ds.to_netcdf( @@ -109,6 +127,9 @@ def unzip_and_compress_era5( } }, ) + if zip_path.exists(): + zip_path.unlink() + uncompressed_path.unlink() @click.command() # type: ignore[arg-type] @@ -117,12 +138,17 @@ def unzip_and_compress_era5( @clio.with_climate_variable() @clio.with_year() @clio.with_month() +@click.option( + "--user", + type=str, +) def download_era5_task( output_dir: str, era5_dataset: str, climate_variable: str, year: str, month: str, + user: str, ) -> None: download_era5_main( output_dir, @@ -130,6 +156,7 @@ def download_era5_task( climate_variable, year, month, + user, ) @@ -171,7 +198,11 @@ def extract_era5( # noqa: PLR0913 queue: str, ) -> None: cddata = ClimateDownscaleData(output_dir) - + cred_path = cddata.credentials_root / "copernicus.yaml" + credentials = yaml.safe_load(cred_path.read_text()) + users = list(credentials['keys']) + jobs_per_user = 20 + datasets = ( clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] ) @@ -185,44 +216,72 @@ def extract_era5( # noqa: PLR0913 to_download = [] to_compress = [] - for dataset, variable, year, month in itertools.product( + complete = [] + for spec in itertools.product( datasets, variables, years, months ): - final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) - download_path, _ = get_download_spec(final_out_path) + final_out_path = cddata.era5_path(*spec) + download_path, _ = get_download_spec(final_out_path) if final_out_path.exists() and download_path.exists(): # We broke in the middle of processing this file. Don't re-download, # just reprocess. final_out_path.unlink() - to_compress.append((dataset, variable, year, month)) + to_compress.append(spec) + elif final_out_path.exists() and final_out_path.stat().st_size == 0: + # Some other kind of error happened + final_out_path.unlink() + to_download.append(spec) + to_compress.append(spec) + elif download_path.exists() and download_path.stat().st_size == 0: + # We broke while downloading. 
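+        # Each spec is one (dataset, variable, year, month) tuple, e.g.
+        # ("reanalysis-era5-land", "2m_temperature", "1990", "01"), so
+        # era5_path(*spec) names exactly one monthly file below.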
Assume this file is invalid and re-download + download_path.unlink() + to_download.append(spec) + to_compress.append(spec) + elif download_path.exists(): + to_compress.append(spec) elif final_out_path.exists(): - # We've already extracted this dataset + # We've already extracted this dataset (deleting the download path is the last step) + complete.append(spec) continue + else: + to_download.append(spec) + to_compress.append(spec) - to_download.append((dataset, variable, year, month)) - to_compress.append((dataset, variable, year, month)) + while to_download: + downloads_left = len(to_download) + + + download_batch = [] + for i in range(jobs_per_user): + for user in users: + if to_download: + download_batch.append( + (*to_download.pop(), user) + ) + assert len(download_batch) == min(len(users) * jobs_per_user, downloads_left) + + print(len(to_download) + len(download_batch), "remaining. Launching next", len(download_batch), "jobs") - jobmon.run_parallel( - runner="cdtask", - task_name="extract era5_download", - flat_node_args=( - ("era5-dataset", "climate-variable", "year", "month"), - to_compress, - ), - task_args={ - "output-dir": output_dir, - }, - task_resources={ - "queue": queue, - "cores": 1, - "memory": "10G", - "runtime": "600m", - "project": "proj_rapidresponse", - }, - max_attempts=1, - concurrency_limit=25, - ) + jobmon.run_parallel( + runner="cdtask", + task_name="extract era5_download", + flat_node_args=( + ("era5-dataset", "climate-variable", "year", "month", "user"), + download_batch, + ), + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "10G", + "runtime": "600m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + ) jobmon.run_parallel( runner="cdtask", From 59d49323d48b44e6a43b7875143fd8bddaa0f703 Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 9 Jun 2024 15:38:10 -0700 Subject: [PATCH 22/71] Port in cmip pipeline --- src/climate_downscale/old_climate/__init__.py | 0 src/climate_downscale/old_climate/data.py | 219 ++++++++++++++++++ .../old_climate/project_anomaly.py | 157 +++++++++++++ .../old_climate/project_climate.py | 172 ++++++++++++++ 4 files changed, 548 insertions(+) create mode 100644 src/climate_downscale/old_climate/__init__.py create mode 100644 src/climate_downscale/old_climate/data.py create mode 100644 src/climate_downscale/old_climate/project_anomaly.py create mode 100644 src/climate_downscale/old_climate/project_climate.py diff --git a/src/climate_downscale/old_climate/__init__.py b/src/climate_downscale/old_climate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/climate_downscale/old_climate/data.py b/src/climate_downscale/old_climate/data.py new file mode 100644 index 0000000..b5422d2 --- /dev/null +++ b/src/climate_downscale/old_climate/data.py @@ -0,0 +1,219 @@ +import itertools +from collections.abc import Callable + +import gcsfs +import pandas as pd +import xarray as xr + + +def load_cmip_metadata( + tables: tuple[str, ...] = ("Amon", "day"), + variables: tuple[str, ...] = ("tas", "pr"), + experiments: tuple[str, ...] = ( + "historical", + "ssp126", + "ssp245", + "ssp370", + "ssp585", + ), +) -> pd.DataFrame: + """Loads CMIP6 metadata for the given tables, variables, and experiments. + + Parameters + ---------- + tables + The tables to include. + variables + The variables to include. + experiments + The experiments to include. 
+ + Returns + ------- + pd.DataFrame + CMIP6 metadata containing only the institutions and sources with all + tables, variables, and experiments. + """ + all_models = load_raw_cmip_metadata() + models_and_params = filter_institutions_and_sources( + all_models, + tables, + variables, + experiments, + ) + + # There should be no duplicates here, but there are. I'm not going to investigate + # why, but I'm just going to drop them. + member_count = models_and_params.groupby( + ["institution_id", "source_id", "member_id"] + )["activity_id"].count() + expected_count = len(tables) * len(variables) * len(experiments) + member_mask = member_count == expected_count + + final_models = ( + models_and_params.set_index(["institution_id", "source_id", "member_id"]) + .loc[member_mask[member_mask].index] + .reset_index() + ) + + # Filter to the models we need for the anomaly analysis. + monthly_historical = (final_models["table_id"] == "Amon") & ( + final_models["experiment_id"] == "historical" + ) + daily_scenario = (final_models["table_id"] == "day") & ( + final_models["experiment_id"] != "historical" + ) + return final_models.loc[monthly_historical | daily_scenario] + + +def load_cmip_historical_data(path: str) -> xr.Dataset: + """Loads a CMIP6 historical dataset from a zarr path. + + Parameters + ---------- + path + The path to the zarr store. + + Returns + ------- + xr.Dataset + The CMIP6 historical dataset. + """ + reference_period = slice("1981-01-15", "2010-12-15") + return ( + load_cmip_data(path) + .sel(time=reference_period) + .groupby("time.month") + .mean("time") + ) + + +def load_cmip_experiment_data(path: str, year: str) -> xr.Dataset: + """Loads a CMIP6 experiment dataset from a zarr path by day for a given year. + + Parameters + ---------- + path + The path to the zarr store. + year + The year to load. + + Returns + ------- + xr.Dataset + The CMIP6 experiment dataset for the given year. + """ + time_slice = slice(f"{year}-01", f"{year}-12") + time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") + return load_cmip_data(path).sel(time=time_slice).interp_calendar(time_range) + + +################## +# Helper methods # +################## + + +def load_raw_cmip_metadata() -> pd.DataFrame: + """Loads metadata containing information about all CMIP6 models.""" + path = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv" + return pd.read_csv(path) + + +def load_cmip_data(zarr_path: str) -> xr.Dataset: + """Loads a CMIP6 dataset from a zarr path.""" + gcs = gcsfs.GCSFileSystem(token="anon") # noqa: S106 + mapper = gcs.get_mapper(zarr_path) + ds = xr.open_zarr(mapper, consolidated=True) + lon = (ds.lon + 180) % 360 - 180 + ds = ds.assign_coords(lon=lon).sortby("lon") + ds = ds.drop( + ["lat_bnds", "lon_bnds", "time_bnds", "height", "time_bounds", "bnds"], + errors="ignore", + ) + return ds # type: ignore[no-any-return] + + +def contains_combo( + table: str, + variable: str, + experiment: str, +) -> Callable[[pd.DataFrame], bool]: + """Get a function to check if a dataset contains a given cmip metadata combination. + + Parameters + ---------- + table + The table to check for. + variable + The variable to check for. + experiment + The experiment to check for. + + Returns + ------- + Callable[[pd.DataFrame], bool] + A function that checks if a dataset contains a given cmip metadata combination.
+ """ + + def _check(df: pd.DataFrame) -> bool: + return ( + df["table_id"].eq(table) + & df["variable_id"].eq(variable) + & df["experiment_id"].eq(experiment) + ).any() + + return _check + + +def filter_institutions_and_sources( + cmip_meta: pd.DataFrame, + tables: tuple[str, ...], + variables: tuple[str, ...], + experiments: tuple[str, ...], +) -> pd.DataFrame: + """Filters a cmip metadata dataframe to only include models that have all + combinations of the given tables, variables, and experiments. + Parameters + ---------- + cmip_meta + CMIP metadata dataframe. + tables + The tables to include. + variables + The variables to include. + experiments + The experiments to include. + Returns + ------- + pd.DataFrame + Filtered cmip metadata containing only the institutions and sources with all + tables, variables, and experiments. + """ + # First we filter down to all models from the institutions and sources that have + # all the combinations of tables, variables, and experiments. + masks = [] + for table, variable, experiment in itertools.product( + tables, variables, experiments + ): + has_combo = cmip_meta.groupby(["institution_id", "source_id"]).apply( + contains_combo(table, variable, experiment) + ) + masks.append(has_combo) + mask = pd.concat(masks, axis=1).all(axis=1) + + institutions_and_sources = mask[mask].index + models_with_all_params = ( + cmip_meta.set_index(["institution_id", "source_id"]) + .loc[institutions_and_sources] + .reset_index() + ) + + # Now we filter down to the specific subset of table/variable/experiment + # combinations within the institutions and sources. + param_mask = ( + models_with_all_params["table_id"].isin(tables) + & models_with_all_params["variable_id"].isin(variables) + & models_with_all_params["experiment_id"].isin(experiments) + ) + models_and_params = models_with_all_params[param_mask] + return models_and_params diff --git a/src/climate_downscale/old_climate/project_anomaly.py b/src/climate_downscale/old_climate/project_anomaly.py new file mode 100644 index 0000000..6ba1dc4 --- /dev/null +++ b/src/climate_downscale/old_climate/project_anomaly.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +import click +import pandas as pd +from rra_tools import jobmon + +from rra_population_pipelines.pipelines.climate import data +from rra_population_pipelines.shared.cli_tools import options as clio +from rra_population_pipelines.shared.data import RRA_POP + +if TYPE_CHECKING: + import xarray as xr + +_ENSEMBLE_MEMBERS = [ + ("NCAR", "CESM2"), + ("MOHC", "UKESM1-0-LL"), + ("IPSL", "IPSL-CM6A-LR"), + ("MPI-M", "MPI-ESM1-2-LR"), + ("MIROC", "MIROC6"), + ("NOAA-GFDL", "GFDL-ESM4"), +] + +_VALID_YEARS = tuple([str(y) for y in range(2015, 2101)]) + + +def get_run_metadata( + variable_id: str, + experiment_id: str, +) -> pd.DataFrame: + metadata = data.load_cmip_metadata() + metadata = ( + metadata.set_index(["institution_id", "source_id"]) + .sort_index() + .loc[_ENSEMBLE_MEMBERS] + .reset_index() + .set_index(["variable_id", "experiment_id"]) + ) + history_meta = ( + metadata.loc[(variable_id, "historical")] + .set_index(["institution_id", "source_id", "member_id"]) # type: ignore[union-attr] + .loc[:, "zstore"] + ) + experiment_meta = ( + metadata.loc[(variable_id, experiment_id)] + .set_index(["institution_id", "source_id", "member_id"]) # type: ignore[union-attr] + .loc[:, "zstore"] + ) + final_meta = pd.concat( + [history_meta.rename("historical"), experiment_meta.rename("experiment")], + 
axis=1,
+    )
+    return final_meta  # type: ignore[no-any-return]
+
+
+def compute_common_lat_lon(
+    run_metadata: pd.DataFrame,
+) -> tuple[pd.Index[float], pd.Index[float]]:
+    lat = pd.Index([], name="lat", dtype=float)
+    lon = pd.Index([], name="lon", dtype=float)
+
+    for key in run_metadata.index.tolist():
+        historical = data.load_cmip_historical_data(run_metadata.at[key, "historical"])
+        lat = lat.union(historical["lat"])  # type: ignore[arg-type]
+        lon = lon.union(historical["lon"])  # type: ignore[arg-type]
+    return lat, lon
+
+
+def compute_single_model_anomaly(
+    historical: xr.Dataset,
+    experiment: xr.Dataset,
+    variable: str,
+) -> xr.Dataset:
+    if variable == "tas":
+        anomaly = experiment.groupby("time.month") - historical
+    else:
+        historical = 86400 * historical + 1
+        experiment = 86400 * experiment + 1
+        anomaly = (1 / historical) * experiment.groupby("time.month")
+    return anomaly
+
+
+def interp_common_lat_lon(
+    ds: xr.Dataset, lat: pd.Index[float], lon: pd.Index[float]
+) -> xr.Dataset:
+    return (
+        ds.pad(lon=1, mode="wrap")
+        .assign_coords(lon=ds.lon.pad(lon=1, mode="reflect", reflect_type="odd"))
+        .interp(lat=lat, lon=lon)
+    )
+
+
+def project_anomaly_main(variable: str, experiment: str, year: str) -> xr.Dataset:
+    run_meta = get_run_metadata(variable, experiment)
+    lat, lon = compute_common_lat_lon(run_meta)
+
+    anomalies: list[xr.Dataset] = []
+    for key in run_meta.index.tolist():
+        historical = data.load_cmip_historical_data(run_meta.at[key, "historical"])
+        scenario = data.load_cmip_experiment_data(
+            run_meta.at[key, "experiment"], year=year
+        )
+        anomaly = compute_single_model_anomaly(historical, scenario, variable=variable)
+        anomaly = interp_common_latin_lon(anomaly, lat, lon)
+        anomalies.append(anomaly)
+
+    mean_anomaly = 1 / len(anomalies) * sum(anomalies)
+    return mean_anomaly  # type: ignore[return-value]
+
+
+@click.command()  # type: ignore[arg-type]
+@click.option(
+    "--variable",
+    type=click.Choice(["tas", "pr"]),
+)
+@clio.with_climate_scenario(allow_all=False)
+@clio.with_year(allow_all=False, choices=_VALID_YEARS)
+@clio.with_output_directory(RRA_POP.projected_climate_anomaly_data)
+def project_anomaly_task(
+    variable: str,
+    climate_scenario: str,
+    year: str,
+    output_dir: str,
+) -> None:
+    projected_anomaly = project_anomaly_main(variable, climate_scenario, year)
+    out_path = Path(output_dir) / f"{variable}_{climate_scenario}_{year}.nc"
+    projected_anomaly.to_netcdf(out_path)
+
+
+@click.command()  # type: ignore[arg-type]
+@clio.with_output_directory(RRA_POP.projected_climate_anomaly_data)
+@clio.with_queue()
+def project_anomaly(output_dir: str, queue: str) -> None:
+    jobmon.run_parallel(
+        task_name="project_anomaly",
+        node_args={
+            "variable": [
+                "tas",
+                "pr",
+            ],
+            "experiment": list(clio.VALID_CLIMATE_SCENARIOS),
+            "year": list(_VALID_YEARS),
+        },
+        task_args={
+            "output-dir": output_dir,
+        },
+        task_resources={
+            "queue": queue,
+            "cores": 2,
+            "memory": "70G",
+            "runtime": "120m",
+            "project": "proj_rapidresponse",
+        },
+        runner="rptask",
+    )
diff --git a/src/climate_downscale/old_climate/project_climate.py b/src/climate_downscale/old_climate/project_climate.py
new file mode 100644
index 0000000..1b366cb
--- /dev/null
+++ b/src/climate_downscale/old_climate/project_climate.py
@@ -0,0 +1,172 @@
+import click
+import pandas as pd
+import xarray as xr
+from rra_population_pipelines.shared.cli_tools import options as clio
+from rra_population_pipelines.shared.data import (
+    RRA_DATA_ROOT,
+    RRA_POP,
+    RRAPopulationData,
+)
+from rra_tools import jobmon
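+
+# The anomaly is applied as a delta change: added for temperature ("tas") and
+# multiplied for precipitation -- see apply_anomaly below; the extra 1 / 30
+# factor there presumably converts the monthly reference to a daily rate.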
+
+
+def get_chelsa(variable: str, lat: slice, lon: slice) -> xr.Dataset:
+    ds_paths = [
+        RRA_POP.get_downscaled_reference_map_path(variable, month)
+        for month in range(1, 13)
+    ]
+    ds = (
+        xr.open_mfdataset(
+            ds_paths,
+            chunks={"lat": -1, "lon": -1},
+            concat_dim=[pd.Index(range(1, 13), name="month")],  # type: ignore[arg-type]
+            combine="nested",
+        )
+        .sel(lat=lat, lon=lon)
+        .rename({"Band1": variable})
+        .drop_vars("crs")
+    )
+    if variable == "tas":  # noqa: SIM108
+        ds = 0.1 * ds - 273.15
+    else:
+        ds = 0.1 * ds
+    return ds
+
+
+def load_and_downscale_anomaly(
+    variable: str,
+    scenario: str,
+    year: int,
+    lat: xr.DataArray,
+    lon: xr.DataArray,
+) -> xr.Dataset:
+    in_root = (
+        RRA_POP.human_niche_data
+        / "chelsa-downscaled-projections"
+        / "_anomalies"
+        / "GLOBAL"
+    )
+    path = in_root / f"{variable}_{scenario}_{year}.nc"
+    ds = xr.open_dataset(
+        path,
+        # Load the whole thing, but use a dask array
+        chunks={"lat": -1, "lon": -1, "time": -1},
+    ).interp(lat=lat, lon=lon)
+    return ds
+
+
+def apply_anomaly(data: xr.Dataset, anomaly: xr.Dataset) -> xr.Dataset:
+    if "tas" in anomaly.keys():  # noqa: SIM118
+        result = anomaly.groupby("time.month") + data
+    else:
+        result = anomaly.groupby("time.month") * data * (1 / 30)
+    return result
+
+
+def compute_measure(data: xr.Dataset, measure: str) -> xr.Dataset:
+    if measure == "temperature":
+        result = data.mean("time")
+    elif measure == "precipitation":
+        result = data.sum("time")
+    else:
+        threshold = 30
+        result = (data > threshold).sum("time")
+    return result
+
+
+def project_climate_main(
+    iso3: str,
+    measure: str,
+    scenario: str,
+    pop_data_dir: str,
+) -> None:
+    pop_data = RRAPopulationData(pop_data_dir)
+    admin0 = pop_data.load_shapefile(
+        admin_level=0,
+        iso3=iso3,
+        year=2022,
+    )
+    minx, miny, maxx, maxy = admin0.total_bounds
+    lat, lon = slice(miny, maxy), slice(minx, maxx)
+
+    variable = {
+        "temperature": "tas",
+        "precipitation": "pr",
+        "days_over_thirty": "tas",
+    }[measure]
+
+    print("Working on", scenario, measure)
+    ds = get_chelsa(variable, lat, lon)
+
+    results = []
+    for year in range(2015, 2101):
+        anom = load_and_downscale_anomaly(
+            variable, scenario, year, ds["lat"], ds["lon"]
+        )
+        result = apply_anomaly(ds, anom)
+        result = compute_measure(result, measure)
+        results.append(result)
+    result = xr.concat(results, dim=pd.Index(range(2015, 2101), name="year"))
+
+    print("Writing results")
+    pop_data.save_climate_data(
+        result,
+        measure=measure,
+        iso3=iso3,
+        scenario=scenario,
+    )
+
+
+@click.command()  # type: ignore[arg-type]
+@clio.with_iso3(allow_all=False)
+@click.option(
+    "--measure",
+    type=click.Choice(["temperature", "precipitation", "days_over_thirty"]),
+)
+@clio.with_climate_scenario(allow_all=False)
+@clio.with_input_directory("pop-data", RRA_DATA_ROOT)
+def project_climate_task(
+    iso3: str,
+    measure: str,
+    climate_scenario: str,
+    pop_data_dir: str,
+) -> None:
+    project_climate_main(iso3, measure, climate_scenario, pop_data_dir)
+
+
+@click.command()  # type: ignore[arg-type]
+@clio.with_iso3(allow_all=False)
+@clio.with_input_directory("pop-data", RRA_DATA_ROOT)
+@clio.with_queue()
+def project_climate(
+    iso3: str,
+    pop_data_dir: str,
+    queue: str,
+) -> None:
+    pop_data = RRAPopulationData(pop_data_dir)
+    jobmon.run_parallel(
+        task_name="project_climate",
+        node_args={
+            "iso3": [
+                iso3,
+            ],
+            "measure": [
+                "temperature",
+                "precipitation",
+                "days_over_thirty",
+            ],
+            "scenario": list(clio.VALID_CLIMATE_SCENARIOS),
+        },
+        task_args={
+            "pop-data-dir": pop_data_dir,
+        },
+        task_resources={
+            "queue": queue,
+            "cores": 2,
+            "memory": "70G",
+            "runtime": "120m",
+            "project": "proj_rapidresponse",
+        },
+        runner="rptask",
+        log_root=pop_data.climate_data,
+    )

From aa22cda4a03ea25b7bae2fa9ad5ea6bcc0cb955b Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 12 Jun 2024 11:15:21 -0700
Subject: [PATCH 23/71] Add notebook code for generating daily era5 estimates

---
 .../model/prepare_era5_daily.py               | 229 ++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 src/climate_downscale/model/prepare_era5_daily.py

diff --git a/src/climate_downscale/model/prepare_era5_daily.py b/src/climate_downscale/model/prepare_era5_daily.py
new file mode 100644
index 0000000..a33b4fc
--- /dev/null
+++ b/src/climate_downscale/model/prepare_era5_daily.py
@@ -0,0 +1,229 @@
+import pandas as pd
+import xarray as xr
+from pathlib import Path
+import numpy as np
+
+
+TARGET_LON = xr.DataArray(np.round(np.arange(0., 360., 0.1, dtype='float32'), 1), dims='longitude')
+TARGET_LAT = xr.DataArray(np.round(np.arange(90., -90.1, -0.1, dtype='float32'), 1), dims='latitude')
+
+
+def kelvin_to_celsius(temperature_k):
+    return temperature_k - 273.15
+
+def m_to_mm(ds):
+    return 1000*ds
+
+def scale_windspeed(windspeed):
+    """Scale wind speed from a height of 10 meters to a height of 2 meters.
+
+    Reference: Bröde et al. (2012)
+    https://doi.org/10.1007/s00484-011-0454-1
+
+    Parameters
+    ----------
+    windspeed
+        The 10m wind speed [m/s]. May be signed (i.e., a velocity component).
+
+    Returns
+    -------
+    xr.Dataset
+        The 2m wind speed [m/s]. May be signed (i.e., a velocity component).
+    """
+    scale_factor = np.log10(2 / 0.01) / np.log10(10 / 0.01)
+    return scale_factor * windspeed
+
+def identity(ds):
+    return ds
+
+def rename_val_column(ds):
+    data_var = next(iter(ds))
+    return ds.rename({data_var: "value"})
+
+
+convert_map = {
+    "10m_u_component_of_wind": scale_windspeed,
+    "10m_v_component_of_wind": scale_windspeed,
+    "2m_dewpoint_temperature": kelvin_to_celsius,
+    "2m_temperature": kelvin_to_celsius,
+    "surface_net_solar_radiation": identity,
+    "surface_net_thermal_radiation": identity,
+    "surface_pressure": identity,
+    "surface_solar_radiation_downwards": identity,
+    "surface_thermal_radiation_downwards": identity,
+    "total_precipitation": m_to_mm,
+    "total_sky_direct_solar_radiation_at_surface": identity,
+}
+
+def interpolate_to_target(ds):
+    return (
+        ds
+        .interp(longitude=TARGET_LON, latitude=TARGET_LAT, method='nearest')
+        .interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate")
+    )
+
+def load_variable(variable, year, month, dataset='single-levels'):
+    root = Path("/mnt/share/erf/climate_downscale/extracted_data/era5")
+    p = root / f"reanalysis-era5-{dataset}_{variable}_{year}_{month}.nc"
+    if dataset == 'land' and not p.exists():
+        # Substitute the single level dataset pre-interpolated at the target resolution.
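+        # (If the ERA5-Land file for this variable and month is missing, fall
+        # back to the coarser single-levels product regridded to the 0.1 degree
+        # target grid so downstream code sees a consistent resolution.)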
+        p = root / f"reanalysis-era5-single-levels_{variable}_{year}_{month}.nc"
+        ds = interpolate_to_target(xr.load_dataset(p))
+    elif dataset == 'land':
+        ds = xr.load_dataset(p).assign_coords(latitude=TARGET_LAT, longitude=TARGET_LON)
+    else:
+        ds = xr.load_dataset(p)
+    conversion = convert_map[variable]
+    ds = conversion(rename_val_column(ds))
+    return ds
+
+
+########
+
+def daily_mean(ds):
+    return ds.groupby('time.date').mean()
+
+def daily_max(ds):
+    return ds.groupby('time.date').max()
+
+def daily_min(ds):
+    return ds.groupby('time.date').min()
+
+def daily_sum(ds):
+    return ds.groupby('time.date').sum()
+
+def cdd(temperature_c):
+    return np.maximum(temperature_c - 18, 0).groupby("time.date").mean()
+
+def hdd(temperature_c):
+    return np.maximum(18 - temperature_c, 0).groupby("time.date").mean()
+
+def vector_magnitude(x, y):
+    return np.sqrt(x**2 + y**2)
+
+
+def buck_vapor_pressure(temperature_c):
+    """Approximate vapor pressure of water.
+
+    https://en.wikipedia.org/wiki/Arden_Buck_equation
+    https://journals.ametsoc.org/view/journals/apme/20/12/1520-0450_1981_020_1527_nefcvp_2_0_co_2.xml
+    """
+    over_water = 6.1121 * np.exp((18.678 - temperature_c / 234.5) * (temperature_c / (257.14 + temperature_c)))
+    over_ice = 6.1115 * np.exp((23.036 - temperature_c / 333.7) * (temperature_c / (279.82 + temperature_c)))
+    return xr.where(temperature_c > 0, over_water, over_ice)
+
+def rh_percent(temperature_c, dewpoint_temperature_c):
+    # saturated vapour pressure
+    es = buck_vapor_pressure(temperature_c)
+    # vapour pressure
+    e = buck_vapor_pressure(dewpoint_temperature_c)
+    rh = (e / es) * 100
+    return rh
+
+def heat_index(temperature_c, dewpoint_temperature_c):
+    t = temperature_c  # Alias for simplicity in the formula
+    r = rh_percent(temperature_c, dewpoint_temperature_c)
+
+    hi_raw = (
+        -8.784695
+        + 1.61139411 * t
+        + 2.338549 * r
+        - 0.14611605 * t * r
+        - 1.2308094e-2 * t**2
+        - 1.6424828e-2 * r**2
+        + 2.211732e-3 * t**2 * r
+        + 7.2546e-4 * t * r**2
+        - 3.582e-6 * t**2 * r**2
+    )
+    hi = xr.where(t > 20, hi_raw, t)
+    return hi

+def humidex(temperature_c, dewpoint_temperature_c):
+    vp = buck_vapor_pressure(dewpoint_temperature_c)
+    return temperature_c + 0.5555 * (vp - 10)
+
+def effective_temperature(temperature_c, dewpoint_temperature_c, uas, vas):
+    """https://www.sciencedirect.com/topics/engineering/effective-temperature"""
+    t = temperature_c
+    r = rh_percent(temperature_c, dewpoint_temperature_c)
+    v = vector_magnitude(uas, vas)
+
+    wind_adjustment = 1 / (1.76 + 1.4 * v**0.75)
+    et = (
+        37
+        - ((37 - t) / (0.68 - 0.0014 * r + wind_adjustment))
+        - 0.29 * t * (1 - 0.01 * r)
+    )
+    return et
+
+
+
+
+
+collapse_map = {
+    "mean_temperature": (["2m_temperature"], daily_mean, (273.15, 0.01)),
+    "max_temperature": (["2m_temperature"], daily_max, (273.15, 0.01)),
+    "min_temperature": (["2m_temperature"], daily_min, (273.15, 0.01)),
+    "cooling_degree_days": (["2m_temperature"], cdd, (0, 0.01)),
+    "heating_degree_days": (["2m_temperature"], hdd, (0, 0.01)),
+    "wind_speed": (
+        ["10m_u_component_of_wind", "10m_v_component_of_wind"], lambda x, y: daily_mean(vector_magnitude(x, y)), (0, 0.01)
+    ),
+    "relative_humidity": (
+        ["2m_temperature", "2m_dewpoint_temperature"], lambda x, y: daily_mean(rh_percent(x, y)), (0, 0.01)
+    ),
+    "total_precipitation": (["total_precipitation"], daily_sum, (0, 0.1)),
+    # "heat_index": (
+    #     ["2m_temperature", "2m_dewpoint_temperature"], lambda x, y: daily_mean(heat_index(x, y)), (273.15, 0.01)
+    # ),
+    # "humidex": (
+    #     ['2m_temperature', '2m_dewpoint_temperature'], lambda x, y: daily_mean(humidex(x, y)), (273.15, 0.01)
+    # ),
+    # "normal_effective_temperature": (
+    #     ["2m_temperature", "2m_dewpoint_temperature", "10m_u_component_of_wind", "10m_v_component_of_wind"],
+    #     lambda *args: daily_mean(effective_temperature(*args)), (273.15, 0.01)
+    # ),
+
+}
+
+year = "1990"
+month = "01"
+target_variable = "wind_speed"
+
+source_variables, collapse_fun, (e_offset, e_scale) = collapse_map[target_variable]
+
+print("loading single-levels")
+single_level = [
+    load_variable(sv, year, month, 'single-levels') for sv in source_variables
+]
+print('collapsing')
+ds = collapse_fun(*single_level)
+ds = ds.assign(date=pd.to_datetime(ds.date))
+
+print('interpolating')
+ds_land_res = interpolate_to_target(ds)
+
+print("loading land")
+land = [
+    load_variable(sv, year, month, 'land') for sv in source_variables
+]
+print('collapsing')
+ds_land = collapse_fun(*land)
+ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date))
+
+print('combining')
+combined = ds_land.combine_first(ds_land_res)
+
+combined.to_netcdf(
+    'compressed.nc',
+    encoding={
+        'value': {
+            'dtype': 'int16',
+            'add_offset': e_offset,
+            'scale_factor': e_scale,
+            '_FillValue': -9999,
+            'zlib': True,
+            'complevel': 1,
+        }
+    }
+)
\ No newline at end of file

From 58824e9d1a1e6003570922b98fa8e7a0084e00ca Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 12 Jun 2024 12:43:29 -0700
Subject: [PATCH 24/71] Add cmip extraction

---
 src/climate_downscale/extract/cmip.py         | 72 +++++++++++++++++++
 .../old_climate/project_anomaly.py            |  2 +-
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 src/climate_downscale/extract/cmip.py

diff --git a/src/climate_downscale/extract/cmip.py b/src/climate_downscale/extract/cmip.py
new file mode 100644
index 0000000..eefe266
--- /dev/null
+++ b/src/climate_downscale/extract/cmip.py
@@ -0,0 +1,72 @@
+import gcsfs
+import pandas as pd
+import xarray as xr
+
+
+def load_raw_cmip_metadata() -> pd.DataFrame:
+    """Loads metadata containing information about all CMIP6 models."""
+    path = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
+    return pd.read_csv(path)
+
+meta = load_raw_cmip_metadata()
+
+keep_sources = [
+    'CAMS-CSM1-0',
+    'CanESM5',
+    'CNRM-ESM2-1',
+    'GFDL-ESM4',
+    'GISS-E2-1-G',
+    'MIROC-ES2L',
+    'MIROC6',
+    'MRI-ESM2-0'
+]
+keep_experiments = [
+    'ssp119',
+    'ssp126',
+    'ssp245',
+    'ssp370',
+    'ssp585',
+]
+
+keep_variables = [
+    "uas",
+    "vas",
+    "hurs",
+    "tas",
+    # "rsus",
+    # "rlus",
+    "ps",
+    # "rsds",
+    # "rlds",
+    "pr",
+    # "rsdsdiff",
+]
+
+keep_tables = [
+    #"Amon",
+    "day",
+]
+
+
+mask = (
+    meta.source_id.isin(keep_sources)
+    & meta.experiment_id.isin(keep_experiments)
+    & meta.variable_id.isin(keep_variables)
+    & meta.table_id.isin(keep_tables)
+)
+
+# Work on a copy so adding the helper column doesn't warn about writing to a
+# slice of `meta`.
+meta_sub = meta[mask].copy()
+meta_sub['dummy'] = "X"
+
+pvs = ['source_id', 'experiment_id', 'variable_id']
+
+# Pivot to an availability grid: one row per (source, experiment), one column
+# per variable, marked where the combination exists.
+meta_sub.groupby(pvs).dummy.apply(lambda s: ",".join(s.unique().tolist())).unstack()
+
+def load_cmip_data(zarr_path: str) -> xr.Dataset:
+    """Loads a CMIP6 dataset from a zarr path."""
+    gcs = gcsfs.GCSFileSystem(token="anon")  # noqa: S106
+    mapper = gcs.get_mapper(zarr_path)
+    ds = xr.open_zarr(mapper, consolidated=True)
+    lon = (ds.lon + 180) % 360 - 180
+    ds = ds.assign_coords(lon=lon).sortby("lon")
+    ds = ds.drop_vars(
+        ["lat_bnds", "lon_bnds", "time_bnds", "height", "time_bounds", "bnds"],
+        errors="ignore",
+    )
+    return ds  # type: ignore[no-any-return]
\ No newline at end of file
diff --git a/src/climate_downscale/old_climate/project_anomaly.py 
b/src/climate_downscale/old_climate/project_anomaly.py index 6ba1dc4..ae37b72 100644 --- a/src/climate_downscale/old_climate/project_anomaly.py +++ b/src/climate_downscale/old_climate/project_anomaly.py @@ -103,7 +103,7 @@ def project_anomaly_main(variable: str, experiment: str, year: str) -> xr.Datase run_meta.at[key, "experiment"], year=year ) anomaly = compute_single_model_anomaly(historical, scenario, variable=variable) - anomaly = interp_common_latin _lon(anomaly, lat, lon) + anomaly = interp_common_lat_lon(anomaly, lat, lon) anomalies.append(anomaly) mean_anomaly = 1 / len(anomalies) * sum(anomalies) From 790ea56e9fb8012d3e6531fb6c3180503b53a73b Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 12:44:03 -0700 Subject: [PATCH 25/71] Add gcsfs and zarr dependencies --- poetry.lock | 909 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + 2 files changed, 909 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0d9646d..e911760 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "affine" @@ -15,6 +15,137 @@ files = [ dev = ["coveralls", "flake8", "pydocstyle"] test = ["pytest (>=4.6)", "pytest-cov"] +[[package]] +name = "aiohttp" +version = "3.9.5" +description = "Async http client/server framework (asyncio)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiohttp-3.9.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:fcde4c397f673fdec23e6b05ebf8d4751314fa7c24f93334bf1f1364c1c69ac7"}, + {file = "aiohttp-3.9.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d6b3f1fabe465e819aed2c421a6743d8debbde79b6a8600739300630a01bf2c"}, + {file = "aiohttp-3.9.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6ae79c1bc12c34082d92bf9422764f799aee4746fd7a392db46b7fd357d4a17a"}, + {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d3ebb9e1316ec74277d19c5f482f98cc65a73ccd5430540d6d11682cd857430"}, + {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84dabd95154f43a2ea80deffec9cb44d2e301e38a0c9d331cc4aa0166fe28ae3"}, + {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8a02fbeca6f63cb1f0475c799679057fc9268b77075ab7cf3f1c600e81dd46b"}, + {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72"}, + {file = "aiohttp-3.9.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:714d4e5231fed4ba2762ed489b4aec07b2b9953cf4ee31e9871caac895a839c0"}, + {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e7a6a8354f1b62e15d48e04350f13e726fa08b62c3d7b8401c0a1314f02e3558"}, + {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c413016880e03e69d166efb5a1a95d40f83d5a3a648d16486592c49ffb76d0db"}, + {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ff84aeb864e0fac81f676be9f4685f0527b660f1efdc40dcede3c251ef1e867f"}, + {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ad7f2919d7dac062f24d6f5fe95d401597fbb015a25771f85e692d043c9d7832"}, + {file = "aiohttp-3.9.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:702e2c7c187c1a498a4e2b03155d52658fdd6fda882d3d7fbb891a5cf108bb10"}, + {file = "aiohttp-3.9.5-cp310-cp310-win32.whl", hash = "sha256:67c3119f5ddc7261d47163ed86d760ddf0e625cd6246b4ed852e82159617b5fb"}, + {file = "aiohttp-3.9.5-cp310-cp310-win_amd64.whl", hash = "sha256:471f0ef53ccedec9995287f02caf0c068732f026455f07db3f01a46e49d76bbb"}, + {file = "aiohttp-3.9.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e0ae53e33ee7476dd3d1132f932eeb39bf6125083820049d06edcdca4381f342"}, + {file = "aiohttp-3.9.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c088c4d70d21f8ca5c0b8b5403fe84a7bc8e024161febdd4ef04575ef35d474d"}, + {file = "aiohttp-3.9.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:639d0042b7670222f33b0028de6b4e2fad6451462ce7df2af8aee37dcac55424"}, + {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f26383adb94da5e7fb388d441bf09c61e5e35f455a3217bfd790c6b6bc64b2ee"}, + {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:66331d00fb28dc90aa606d9a54304af76b335ae204d1836f65797d6fe27f1ca2"}, + {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ff550491f5492ab5ed3533e76b8567f4b37bd2995e780a1f46bca2024223233"}, + {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f22eb3a6c1080d862befa0a89c380b4dafce29dc6cd56083f630073d102eb595"}, + {file = "aiohttp-3.9.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a81b1143d42b66ffc40a441379387076243ef7b51019204fd3ec36b9f69e77d6"}, + {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f64fd07515dad67f24b6ea4a66ae2876c01031de91c93075b8093f07c0a2d93d"}, + {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:93e22add827447d2e26d67c9ac0161756007f152fdc5210277d00a85f6c92323"}, + {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:55b39c8684a46e56ef8c8d24faf02de4a2b2ac60d26cee93bc595651ff545de9"}, + {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4715a9b778f4293b9f8ae7a0a7cef9829f02ff8d6277a39d7f40565c737d3771"}, + {file = "aiohttp-3.9.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afc52b8d969eff14e069a710057d15ab9ac17cd4b6753042c407dcea0e40bf75"}, + {file = "aiohttp-3.9.5-cp311-cp311-win32.whl", hash = "sha256:b3df71da99c98534be076196791adca8819761f0bf6e08e07fd7da25127150d6"}, + {file = "aiohttp-3.9.5-cp311-cp311-win_amd64.whl", hash = "sha256:88e311d98cc0bf45b62fc46c66753a83445f5ab20038bcc1b8a1cc05666f428a"}, + {file = "aiohttp-3.9.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c7a4b7a6cf5b6eb11e109a9755fd4fda7d57395f8c575e166d363b9fc3ec4678"}, + {file = "aiohttp-3.9.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0a158704edf0abcac8ac371fbb54044f3270bdbc93e254a82b6c82be1ef08f3c"}, + {file = "aiohttp-3.9.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d153f652a687a8e95ad367a86a61e8d53d528b0530ef382ec5aaf533140ed00f"}, + {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82a6a97d9771cb48ae16979c3a3a9a18b600a8505b1115cfe354dfb2054468b4"}, + {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60cdbd56f4cad9f69c35eaac0fbbdf1f77b0ff9456cebd4902f3dd1cf096464c"}, + {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:8676e8fd73141ded15ea586de0b7cda1542960a7b9ad89b2b06428e97125d4fa"}, + {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da00da442a0e31f1c69d26d224e1efd3a1ca5bcbf210978a2ca7426dfcae9f58"}, + {file = "aiohttp-3.9.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18f634d540dd099c262e9f887c8bbacc959847cfe5da7a0e2e1cf3f14dbf2daf"}, + {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:320e8618eda64e19d11bdb3bd04ccc0a816c17eaecb7e4945d01deee2a22f95f"}, + {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:2faa61a904b83142747fc6a6d7ad8fccff898c849123030f8e75d5d967fd4a81"}, + {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:8c64a6dc3fe5db7b1b4d2b5cb84c4f677768bdc340611eca673afb7cf416ef5a"}, + {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:393c7aba2b55559ef7ab791c94b44f7482a07bf7640d17b341b79081f5e5cd1a"}, + {file = "aiohttp-3.9.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c671dc117c2c21a1ca10c116cfcd6e3e44da7fcde37bf83b2be485ab377b25da"}, + {file = "aiohttp-3.9.5-cp312-cp312-win32.whl", hash = "sha256:5a7ee16aab26e76add4afc45e8f8206c95d1d75540f1039b84a03c3b3800dd59"}, + {file = "aiohttp-3.9.5-cp312-cp312-win_amd64.whl", hash = "sha256:5ca51eadbd67045396bc92a4345d1790b7301c14d1848feaac1d6a6c9289e888"}, + {file = "aiohttp-3.9.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:694d828b5c41255e54bc2dddb51a9f5150b4eefa9886e38b52605a05d96566e8"}, + {file = "aiohttp-3.9.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0605cc2c0088fcaae79f01c913a38611ad09ba68ff482402d3410bf59039bfb8"}, + {file = "aiohttp-3.9.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4558e5012ee03d2638c681e156461d37b7a113fe13970d438d95d10173d25f78"}, + {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9dbc053ac75ccc63dc3a3cc547b98c7258ec35a215a92bd9f983e0aac95d3d5b"}, + {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4109adee842b90671f1b689901b948f347325045c15f46b39797ae1bf17019de"}, + {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a6ea1a5b409a85477fd8e5ee6ad8f0e40bf2844c270955e09360418cfd09abac"}, + {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3c2890ca8c59ee683fd09adf32321a40fe1cf164e3387799efb2acebf090c11"}, + {file = "aiohttp-3.9.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3916c8692dbd9d55c523374a3b8213e628424d19116ac4308e434dbf6d95bbdd"}, + {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d1964eb7617907c792ca00b341b5ec3e01ae8c280825deadbbd678447b127e1"}, + {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d5ab8e1f6bee051a4bf6195e38a5c13e5e161cb7bad83d8854524798bd9fcd6e"}, + {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:52c27110f3862a1afbcb2af4281fc9fdc40327fa286c4625dfee247c3ba90156"}, + {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:7f64cbd44443e80094309875d4f9c71d0401e966d191c3d469cde4642bc2e031"}, + {file = "aiohttp-3.9.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8b4f72fbb66279624bfe83fd5eb6aea0022dad8eec62b71e7bf63ee1caadeafe"}, + {file = "aiohttp-3.9.5-cp38-cp38-win32.whl", hash = 
"sha256:6380c039ec52866c06d69b5c7aad5478b24ed11696f0e72f6b807cfb261453da"}, + {file = "aiohttp-3.9.5-cp38-cp38-win_amd64.whl", hash = "sha256:da22dab31d7180f8c3ac7c7635f3bcd53808f374f6aa333fe0b0b9e14b01f91a"}, + {file = "aiohttp-3.9.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1732102949ff6087589408d76cd6dea656b93c896b011ecafff418c9661dc4ed"}, + {file = "aiohttp-3.9.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c6021d296318cb6f9414b48e6a439a7f5d1f665464da507e8ff640848ee2a58a"}, + {file = "aiohttp-3.9.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:239f975589a944eeb1bad26b8b140a59a3a320067fb3cd10b75c3092405a1372"}, + {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b7b30258348082826d274504fbc7c849959f1989d86c29bc355107accec6cfb"}, + {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd2adf5c87ff6d8b277814a28a535b59e20bfea40a101db6b3bdca7e9926bc24"}, + {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9a3d838441bebcf5cf442700e3963f58b5c33f015341f9ea86dcd7d503c07e2"}, + {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e3a1ae66e3d0c17cf65c08968a5ee3180c5a95920ec2731f53343fac9bad106"}, + {file = "aiohttp-3.9.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c69e77370cce2d6df5d12b4e12bdcca60c47ba13d1cbbc8645dd005a20b738b"}, + {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0cbf56238f4bbf49dab8c2dc2e6b1b68502b1e88d335bea59b3f5b9f4c001475"}, + {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d1469f228cd9ffddd396d9948b8c9cd8022b6d1bf1e40c6f25b0fb90b4f893ed"}, + {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:45731330e754f5811c314901cebdf19dd776a44b31927fa4b4dbecab9e457b0c"}, + {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:3fcb4046d2904378e3aeea1df51f697b0467f2aac55d232c87ba162709478c46"}, + {file = "aiohttp-3.9.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8cf142aa6c1a751fcb364158fd710b8a9be874b81889c2bd13aa8893197455e2"}, + {file = "aiohttp-3.9.5-cp39-cp39-win32.whl", hash = "sha256:7b179eea70833c8dee51ec42f3b4097bd6370892fa93f510f76762105568cf09"}, + {file = "aiohttp-3.9.5-cp39-cp39-win_amd64.whl", hash = "sha256:38d80498e2e169bc61418ff36170e0aad0cd268da8b38a17c4cf29d254a8b3f1"}, + {file = "aiohttp-3.9.5.tar.gz", hash = "sha256:edea7d15772ceeb29db4aff55e482d4bcfb6ae160ce144f2682de02f6d693551"}, +] + +[package.dependencies] +aiosignal = ">=1.1.2" +async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""} +attrs = ">=17.3.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns", "brotlicffi"] + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +optional = false +python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] + +[package.dependencies] +frozenlist = ">=1.1.0" + +[[package]] +name = "asciitree" +version = "0.3.3" +description = "Draws ASCII trees." 
+optional = false +python-versions = "*" +files = [ + {file = "asciitree-0.3.3.tar.gz", hash = "sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e"}, +] + +[[package]] +name = "async-timeout" +version = "4.0.3" +description = "Timeout context manager for asyncio programs" +optional = false +python-versions = ">=3.7" +files = [ + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, +] + [[package]] name = "attrs" version = "23.2.0" @@ -69,6 +200,17 @@ charset-normalizer = ["charset-normalizer"] html5lib = ["html5lib"] lxml = ["lxml"] +[[package]] +name = "cachetools" +version = "5.3.3" +description = "Extensible memoizing collections and decorators" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cachetools-5.3.3-py3-none-any.whl", hash = "sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945"}, + {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"}, +] + [[package]] name = "cads-api-client" version = "1.0.0" @@ -463,6 +605,17 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + [[package]] name = "deep-translator" version = "1.11.4" @@ -523,6 +676,17 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "fasteners" +version = "0.19" +description = "A python package that provides useful locks" +optional = false +python-versions = ">=3.6" +files = [ + {file = "fasteners-0.19-py3-none-any.whl", hash = "sha256:758819cb5d94cdedf4e836988b74de396ceacb8e2794d21f82d131fd9ee77237"}, + {file = "fasteners-0.19.tar.gz", hash = "sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c"}, +] + [[package]] name = "filelock" version = "3.14.0" @@ -651,6 +815,155 @@ ufo = ["fs (>=2.2.0,<3)"] unicode = ["unicodedata2 (>=15.1.0)"] woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] +[[package]] +name = "frozenlist" +version = "1.4.1" +description = "A list-like structure which implements collections.abc.MutableSequence" +optional = false +python-versions = ">=3.8" +files = [ + {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f9aa1878d1083b276b0196f2dfbe00c9b7e752475ed3b682025ff20c1c1f51ac"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29acab3f66f0f24674b7dc4736477bcd4bc3ad4b896f5f45379a67bce8b96868"}, + {file = "frozenlist-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:74fb4bee6880b529a0c6560885fce4dc95936920f9f20f53d99a213f7bf66776"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590344787a90ae57d62511dd7c736ed56b428f04cd8c161fcc5e7232c130c69a"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:068b63f23b17df8569b7fdca5517edef76171cf3897eb68beb01341131fbd2ad"}, + {file = 
"frozenlist-1.4.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c849d495bf5154cd8da18a9eb15db127d4dba2968d88831aff6f0331ea9bd4c"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9750cc7fe1ae3b1611bb8cfc3f9ec11d532244235d75901fb6b8e42ce9229dfe"}, + {file = "frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0633c8d5337cb5c77acbccc6357ac49a1770b8c487e5b3505c57b949b4b82e98"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:27657df69e8801be6c3638054e202a135c7f299267f1a55ed3a598934f6c0d75"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:f9a3ea26252bd92f570600098783d1371354d89d5f6b7dfd87359d669f2109b5"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:4f57dab5fe3407b6c0c1cc907ac98e8a189f9e418f3b6e54d65a718aaafe3950"}, + {file = "frozenlist-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e02a0e11cf6597299b9f3bbd3f93d79217cb90cfd1411aec33848b13f5c656cc"}, + {file = "frozenlist-1.4.1-cp310-cp310-win32.whl", hash = "sha256:a828c57f00f729620a442881cc60e57cfcec6842ba38e1b19fd3e47ac0ff8dc1"}, + {file = "frozenlist-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:f56e2333dda1fe0f909e7cc59f021eba0d2307bc6f012a1ccf2beca6ba362439"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a0cb6f11204443f27a1628b0e460f37fb30f624be6051d490fa7d7e26d4af3d0"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b46c8ae3a8f1f41a0d2ef350c0b6e65822d80772fe46b653ab6b6274f61d4a49"}, + {file = "frozenlist-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fde5bd59ab5357e3853313127f4d3565fc7dad314a74d7b5d43c22c6a5ed2ced"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722e1124aec435320ae01ee3ac7bec11a5d47f25d0ed6328f2273d287bc3abb0"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2471c201b70d58a0f0c1f91261542a03d9a5e088ed3dc6c160d614c01649c106"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c757a9dd70d72b076d6f68efdbb9bc943665ae954dad2801b874c8c69e185068"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f146e0911cb2f1da549fc58fc7bcd2b836a44b79ef871980d605ec392ff6b0d2"}, + {file = "frozenlist-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9c515e7914626b2a2e1e311794b4c35720a0be87af52b79ff8e1429fc25f19"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c302220494f5c1ebeb0912ea782bcd5e2f8308037b3c7553fad0e48ebad6ad82"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:442acde1e068288a4ba7acfe05f5f343e19fac87bfc96d89eb886b0363e977ec"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:1b280e6507ea8a4fa0c0a7150b4e526a8d113989e28eaaef946cc77ffd7efc0a"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = 
"sha256:fe1a06da377e3a1062ae5fe0926e12b84eceb8a50b350ddca72dc85015873f74"}, + {file = "frozenlist-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:db9e724bebd621d9beca794f2a4ff1d26eed5965b004a97f1f1685a173b869c2"}, + {file = "frozenlist-1.4.1-cp311-cp311-win32.whl", hash = "sha256:e774d53b1a477a67838a904131c4b0eef6b3d8a651f8b138b04f748fccfefe17"}, + {file = "frozenlist-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:fb3c2db03683b5767dedb5769b8a40ebb47d6f7f45b1b3e3b4b51ec8ad9d9825"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:1979bc0aeb89b33b588c51c54ab0161791149f2461ea7c7c946d95d5f93b56ae"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cc7b01b3754ea68a62bd77ce6020afaffb44a590c2289089289363472d13aedb"}, + {file = "frozenlist-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9c92be9fd329ac801cc420e08452b70e7aeab94ea4233a4804f0915c14eba9b"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3894db91f5a489fc8fa6a9991820f368f0b3cbdb9cd8849547ccfab3392d86"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba60bb19387e13597fb059f32cd4d59445d7b18b69a745b8f8e5db0346f33480"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8aefbba5f69d42246543407ed2461db31006b0f76c4e32dfd6f42215a2c41d09"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780d3a35680ced9ce682fbcf4cb9c2bad3136eeff760ab33707b71db84664e3a"}, + {file = "frozenlist-1.4.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9acbb16f06fe7f52f441bb6f413ebae6c37baa6ef9edd49cdd567216da8600cd"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:23b701e65c7b36e4bf15546a89279bd4d8675faabc287d06bbcfac7d3c33e1e6"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3e0153a805a98f5ada7e09826255ba99fb4f7524bb81bf6b47fb702666484ae1"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:dd9b1baec094d91bf36ec729445f7769d0d0cf6b64d04d86e45baf89e2b9059b"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:1a4471094e146b6790f61b98616ab8e44f72661879cc63fa1049d13ef711e71e"}, + {file = "frozenlist-1.4.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5667ed53d68d91920defdf4035d1cdaa3c3121dc0b113255124bcfada1cfa1b8"}, + {file = "frozenlist-1.4.1-cp312-cp312-win32.whl", hash = "sha256:beee944ae828747fd7cb216a70f120767fc9f4f00bacae8543c14a6831673f89"}, + {file = "frozenlist-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:64536573d0a2cb6e625cf309984e2d873979709f2cf22839bf2d61790b448ad5"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:20b51fa3f588ff2fe658663db52a41a4f7aa6c04f6201449c6c7c476bd255c0d"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:410478a0c562d1a5bcc2f7ea448359fcb050ed48b3c6f6f4f18c313a9bdb1826"}, + {file = "frozenlist-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c6321c9efe29975232da3bd0af0ad216800a47e93d763ce64f291917a381b8eb"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48f6a4533887e189dae092f1cf981f2e3885175f7a0f33c91fb5b7b682b6bab6"}, + {file = 
"frozenlist-1.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6eb73fa5426ea69ee0e012fb59cdc76a15b1283d6e32e4f8dc4482ec67d1194d"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbeb989b5cc29e8daf7f976b421c220f1b8c731cbf22b9130d8815418ea45887"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:32453c1de775c889eb4e22f1197fe3bdfe457d16476ea407472b9442e6295f7a"}, + {file = "frozenlist-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693945278a31f2086d9bf3df0fe8254bbeaef1fe71e1351c3bd730aa7d31c41b"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d0ce09d36d53bbbe566fe296965b23b961764c0bcf3ce2fa45f463745c04701"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:3a670dc61eb0d0eb7080890c13de3066790f9049b47b0de04007090807c776b0"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:dca69045298ce5c11fd539682cff879cc1e664c245d1c64da929813e54241d11"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a06339f38e9ed3a64e4c4e43aec7f59084033647f908e4259d279a52d3757d09"}, + {file = "frozenlist-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b7f2f9f912dca3934c1baec2e4585a674ef16fe00218d833856408c48d5beee7"}, + {file = "frozenlist-1.4.1-cp38-cp38-win32.whl", hash = "sha256:e7004be74cbb7d9f34553a5ce5fb08be14fb33bc86f332fb71cbe5216362a497"}, + {file = "frozenlist-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:5a7d70357e7cee13f470c7883a063aae5fe209a493c57d86eb7f5a6f910fae09"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:bfa4a17e17ce9abf47a74ae02f32d014c5e9404b6d9ac7f729e01562bbee601e"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b7e3ed87d4138356775346e6845cccbe66cd9e207f3cd11d2f0b9fd13681359d"}, + {file = "frozenlist-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c99169d4ff810155ca50b4da3b075cbde79752443117d89429595c2e8e37fed8"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edb678da49d9f72c9f6c609fbe41a5dfb9a9282f9e6a2253d5a91e0fc382d7c0"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6db4667b187a6742b33afbbaf05a7bc551ffcf1ced0000a571aedbb4aa42fc7b"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55fdc093b5a3cb41d420884cdaf37a1e74c3c37a31f46e66286d9145d2063bd0"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82e8211d69a4f4bc360ea22cd6555f8e61a1bd211d1d5d39d3d228b48c83a897"}, + {file = "frozenlist-1.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89aa2c2eeb20957be2d950b85974b30a01a762f3308cd02bb15e1ad632e22dc7"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9d3e0c25a2350080e9319724dede4f31f43a6c9779be48021a7f4ebde8b2d742"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7268252af60904bf52c26173cbadc3a071cece75f873705419c8681f24d3edea"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:0c250a29735d4f15321007fb02865f0e6b6a41a6b88f1f523ca1596ab5f50bd5"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:96ec70beabbd3b10e8bfe52616a13561e58fe84c0101dd031dc78f250d5128b9"}, + {file = "frozenlist-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:23b2d7679b73fe0e5a4560b672a39f98dfc6f60df63823b0a9970525325b95f6"}, + {file = "frozenlist-1.4.1-cp39-cp39-win32.whl", hash = "sha256:a7496bfe1da7fb1a4e1cc23bb67c58fab69311cc7d32b5a99c2007b4b2a0e932"}, + {file = "frozenlist-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e6a20a581f9ce92d389a8c7d7c3dd47c81fd5d6e655c8dddf341e14aa48659d0"}, + {file = "frozenlist-1.4.1-py3-none-any.whl", hash = "sha256:04ced3e6a46b4cfffe20f9ae482818e34eba9b5fb0ce4056e4cc9b6e212d09b7"}, + {file = "frozenlist-1.4.1.tar.gz", hash = "sha256:c037a86e8513059a2613aaba4d817bb90b9d9b6b69aace3ce9c877e8c8ed402b"}, +] + +[[package]] +name = "fsspec" +version = "2024.6.0" +description = "File-system specification" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fsspec-2024.6.0-py3-none-any.whl", hash = "sha256:58d7122eb8a1a46f7f13453187bfea4972d66bf01618d37366521b1998034cee"}, + {file = "fsspec-2024.6.0.tar.gz", hash = "sha256:f579960a56e6d8038a9efc8f9c77279ec12e6299aa86b0769a7e9c46b94527c2"}, +] + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +arrow = ["pyarrow (>=1)"] +dask = ["dask", "distributed"] +dev = ["pre-commit", "ruff"] +doc = ["numpydoc", "sphinx", "sphinx-design", "sphinx-rtd-theme", "yarl"] +dropbox = ["dropbox", "dropboxdrivefs", "requests"] +full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] +fuse = ["fusepy"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +gui = ["panel"] +hdfs = ["pyarrow (>=1)"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] +libarchive = ["libarchive-c"] +oci = ["ocifs"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] +test = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "numpy", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "requests"] +test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,test]", "moto[server] (>4,<5)", "pytest-timeout", "xarray"] +test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] +tqdm = ["tqdm"] + +[[package]] +name = "gcsfs" +version = "2024.6.0" +description = "Convenient Filesystem interface over GCS" +optional = false +python-versions = ">=3.8" +files = [ + {file = "gcsfs-2024.6.0-py2.py3-none-any.whl", hash = "sha256:92c9239167bd1e209b662b6f4ab71974f276118779c55360215cce5e0098ca7f"}, + {file = "gcsfs-2024.6.0.tar.gz", hash = "sha256:27bd490d7a9dd641d5f6f4ea0b18fabdcfa6129b84ebdb22b23e3460ded1aa8c"}, +] + +[package.dependencies] +aiohttp = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1" +decorator = ">4.1.2" +fsspec = "2024.6.0" +google-auth = 
">=1.2" +google-auth-oauthlib = "*" +google-cloud-storage = "*" +requests = "*" + +[package.extras] +crc = ["crcmod"] +gcsfuse = ["fusepy"] + [[package]] name = "geopandas" version = "0.14.4" @@ -719,6 +1032,225 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] +[[package]] +name = "google-api-core" +version = "2.19.0" +description = "Google API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-api-core-2.19.0.tar.gz", hash = "sha256:cf1b7c2694047886d2af1128a03ae99e391108a08804f87cfd35970e49c9cd10"}, + {file = "google_api_core-2.19.0-py3-none-any.whl", hash = "sha256:8661eec4078c35428fd3f69a2c7ee29e342896b70f01d1a1cbcb334372dd6251"}, +] + +[package.dependencies] +google-auth = ">=2.14.1,<3.0.dev0" +googleapis-common-protos = ">=1.56.2,<2.0.dev0" +proto-plus = ">=1.22.3,<2.0.0dev" +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" +requests = ">=2.18.0,<3.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"] +grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] +grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] + +[[package]] +name = "google-auth" +version = "2.30.0" +description = "Google Authentication Library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-auth-2.30.0.tar.gz", hash = "sha256:ab630a1320f6720909ad76a7dbdb6841cdf5c66b328d690027e4867bdfb16688"}, + {file = "google_auth-2.30.0-py2.py3-none-any.whl", hash = "sha256:8df7da660f62757388b8a7f249df13549b3373f24388cb5d2f1dd91cc18180b5"}, +] + +[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = ">=3.1.4,<5" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] +enterprise-cert = ["cryptography (==36.0.2)", "pyopenssl (==22.0.0)"] +pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] +requests = ["requests (>=2.20.0,<3.0.0.dev0)"] + +[[package]] +name = "google-auth-oauthlib" +version = "1.2.0" +description = "Google Authentication Library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "google-auth-oauthlib-1.2.0.tar.gz", hash = "sha256:292d2d3783349f2b0734a0a0207b1e1e322ac193c2c09d8f7c613fb7cc501ea8"}, + {file = "google_auth_oauthlib-1.2.0-py2.py3-none-any.whl", hash = "sha256:297c1ce4cb13a99b5834c74a1fe03252e1e499716718b190f56bcb9c4abc4faf"}, +] + +[package.dependencies] +google-auth = ">=2.15.0" +requests-oauthlib = ">=0.7.0" + +[package.extras] +tool = ["click (>=6.0.0)"] + +[[package]] +name = "google-cloud-core" +version = "2.4.1" +description = "Google Cloud API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-core-2.4.1.tar.gz", hash = "sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073"}, + {file = "google_cloud_core-2.4.1-py2.py3-none-any.whl", hash = 
"sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61"}, +] + +[package.dependencies] +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-auth = ">=1.25.0,<3.0dev" + +[package.extras] +grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] + +[[package]] +name = "google-cloud-storage" +version = "2.17.0" +description = "Google Cloud Storage API client library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-storage-2.17.0.tar.gz", hash = "sha256:49378abff54ef656b52dca5ef0f2eba9aa83dc2b2c72c78714b03a1a95fe9388"}, + {file = "google_cloud_storage-2.17.0-py2.py3-none-any.whl", hash = "sha256:5b393bc766b7a3bc6f5407b9e665b2450d36282614b7945e570b3480a456d1e1"}, +] + +[package.dependencies] +google-api-core = ">=2.15.0,<3.0.0dev" +google-auth = ">=2.26.1,<3.0dev" +google-cloud-core = ">=2.3.0,<3.0dev" +google-crc32c = ">=1.0,<2.0dev" +google-resumable-media = ">=2.6.0" +requests = ">=2.18.0,<3.0.0dev" + +[package.extras] +protobuf = ["protobuf (<5.0.0dev)"] + +[[package]] +name = "google-crc32c" +version = "1.5.0" +description = "A python wrapper of the C library 'Google CRC32C'" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, + {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13"}, + {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b"}, + {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e"}, + {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c"}, + {file = "google_crc32c-1.5.0-cp310-cp310-win32.whl", hash = "sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee"}, + {file = "google_crc32c-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289"}, + {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273"}, + {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438"}, + {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd"}, + {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c"}, + {file = "google_crc32c-1.5.0-cp311-cp311-win32.whl", hash = "sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709"}, + {file = "google_crc32c-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-win32.whl", hash = "sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94"}, + {file = "google_crc32c-1.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740"}, + {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8"}, + {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d"}, + {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a"}, + 
{file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894"}, + {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a"}, + {file = "google_crc32c-1.5.0-cp38-cp38-win32.whl", hash = "sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4"}, + {file = "google_crc32c-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c"}, + {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7"}, + {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57"}, + {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96"}, + {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61"}, + {file = "google_crc32c-1.5.0-cp39-cp39-win32.whl", hash = "sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c"}, + {file = "google_crc32c-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178"}, + {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462"}, + {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31"}, + {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93"}, +] + +[package.extras] +testing = ["pytest"] + +[[package]] +name = "google-resumable-media" +version = "2.7.1" +description = "Utilities for Google Media Downloads and Resumable Uploads" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-resumable-media-2.7.1.tar.gz", hash = "sha256:eae451a7b2e2cdbaaa0fd2eb00cc8a1ee5e95e16b55597359cbc3d27d7d90e33"}, + {file = "google_resumable_media-2.7.1-py2.py3-none-any.whl", hash = "sha256:103ebc4ba331ab1bfdac0250f8033627a2cd7cde09e7ccff9181e31ba4315b2c"}, +] + +[package.dependencies] +google-crc32c = ">=1.0,<2.0dev" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"] +requests = ["requests (>=2.18.0,<3.0.0dev)"] + +[[package]] +name = "googleapis-common-protos" +version = "1.63.1" +description = "Common protobufs used in Google APIs" +optional = false +python-versions = ">=3.7" +files = [ + {file = "googleapis-common-protos-1.63.1.tar.gz", hash = "sha256:c6442f7a0a6b2a80369457d79e6672bb7dcbaab88e0848302497e3ec80780a6a"}, + {file = "googleapis_common_protos-1.63.1-py2.py3-none-any.whl", hash = "sha256:0e1c2cdfcbc354b76e4a211a35ea35d6926a835cba1377073c4861db904a1877"}, +] + +[package.dependencies] +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] + [[package]] name = "griffe" version = "0.44.0" @@ -1232,6 +1764,105 @@ files = [ griffe = ">=0.44" mkdocstrings = ">=0.24.2" +[[package]] +name = "multidict" +version = "6.0.5" +description = "multidict implementation" +optional = false +python-versions = ">=3.7" +files = [ + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = "multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"}, + {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"}, + {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"}, + {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash 
= "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"}, + {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"}, + {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"}, + {file = 
"multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"}, + {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"}, + {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, + {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, +] + [[package]] name = "multiprocess" version = "0.70.16" @@ -1388,6 +2019,46 @@ files = [ [package.dependencies] setuptools = "*" +[[package]] +name = "numcodecs" +version = "0.12.1" +description = "A Python package providing buffer compression and transformation codecs for use in data storage and communication applications." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "numcodecs-0.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d37f628fe92b3699e65831d5733feca74d2e33b50ef29118ffd41c13c677210e"}, + {file = "numcodecs-0.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:941b7446b68cf79f089bcfe92edaa3b154533dcbcd82474f994b28f2eedb1c60"}, + {file = "numcodecs-0.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e79bf9d1d37199ac00a60ff3adb64757523291d19d03116832e600cac391c51"}, + {file = "numcodecs-0.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:82d7107f80f9307235cb7e74719292d101c7ea1e393fe628817f0d635b7384f5"}, + {file = "numcodecs-0.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:eeaf42768910f1c6eebf6c1bb00160728e62c9343df9e2e315dc9fe12e3f6071"}, + {file = "numcodecs-0.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:135b2d47563f7b9dc5ee6ce3d1b81b0f1397f69309e909f1a35bb0f7c553d45e"}, + {file = "numcodecs-0.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a191a8e347ecd016e5c357f2bf41fbcb026f6ffe78fff50c77ab12e96701d155"}, + {file = "numcodecs-0.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:21d8267bd4313f4d16f5b6287731d4c8ebdab236038f29ad1b0e93c9b2ca64ee"}, + {file = "numcodecs-0.12.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2f84df6b8693206365a5b37c005bfa9d1be486122bde683a7b6446af4b75d862"}, + {file = "numcodecs-0.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:760627780a8b6afdb7f942f2a0ddaf4e31d3d7eea1d8498cf0fd3204a33c4618"}, + {file = "numcodecs-0.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c258bd1d3dfa75a9b708540d23b2da43d63607f9df76dfa0309a7597d1de3b73"}, + {file = "numcodecs-0.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:e04649ea504aff858dbe294631f098fbfd671baf58bfc04fc48d746554c05d67"}, + {file = "numcodecs-0.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:caf1a1e6678aab9c1e29d2109b299f7a467bd4d4c34235b1f0e082167846b88f"}, + {file = "numcodecs-0.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c17687b1fd1fef68af616bc83f896035d24e40e04e91e7e6dae56379eb59fe33"}, + {file = "numcodecs-0.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29dfb195f835a55c4d490fb097aac8c1bcb96c54cf1b037d9218492c95e9d8c5"}, + {file = "numcodecs-0.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:2f1ba2f4af3fd3ba65b1bcffb717fe65efe101a50a91c368f79f3101dbb1e243"}, + {file = "numcodecs-0.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2fbb12a6a1abe95926f25c65e283762d63a9bf9e43c0de2c6a1a798347dfcb40"}, + {file = "numcodecs-0.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f2207871868b2464dc11c513965fd99b958a9d7cde2629be7b2dc84fdaab013b"}, + {file = "numcodecs-0.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abff3554a6892a89aacf7b642a044e4535499edf07aeae2f2e6e8fc08c9ba07f"}, + {file = "numcodecs-0.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:ef964d4860d3e6b38df0633caf3e51dc850a6293fd8e93240473642681d95136"}, + {file = "numcodecs-0.12.1.tar.gz", hash = "sha256:05d91a433733e7eef268d7e80ec226a0232da244289614a8f3826901aec1098e"}, +] + +[package.dependencies] +numpy = ">=1.7" + +[package.extras] +docs = ["mock", "numpydoc", "sphinx (<7.0.0)", "sphinx-issues"] +msgpack = ["msgpack"] +test = ["coverage", "flake8", "pytest", "pytest-cov"] +test-extras = ["importlib-metadata"] +zfpy = ["zfpy (>=1.0.0)"] + [[package]] name = "numpy" version = "1.26.4" @@ -1433,6 +2104,22 @@ files = [ {file = 
"numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +optional = false +python-versions = ">=3.6" +files = [ + {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, + {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, +] + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + [[package]] name = "packaging" version = "24.0" @@ -1715,6 +2402,43 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" +[[package]] +name = "proto-plus" +version = "1.23.0" +description = "Beautiful, Pythonic protocol buffers." +optional = false +python-versions = ">=3.6" +files = [ + {file = "proto-plus-1.23.0.tar.gz", hash = "sha256:89075171ef11988b3fa157f5dbd8b9cf09d65fffee97e29ce403cd8defba19d2"}, + {file = "proto_plus-1.23.0-py3-none-any.whl", hash = "sha256:a829c79e619e1cf632de091013a4173deed13a55f326ef84f05af6f50ff4c82c"}, +] + +[package.dependencies] +protobuf = ">=3.19.0,<5.0.0dev" + +[package.extras] +testing = ["google-api-core[grpc] (>=1.31.5)"] + +[[package]] +name = "protobuf" +version = "4.25.3" +description = "" +optional = false +python-versions = ">=3.8" +files = [ + {file = "protobuf-4.25.3-cp310-abi3-win32.whl", hash = "sha256:d4198877797a83cbfe9bffa3803602bbe1625dc30d8a097365dbc762e5790faa"}, + {file = "protobuf-4.25.3-cp310-abi3-win_amd64.whl", hash = "sha256:209ba4cc916bab46f64e56b85b090607a676f66b473e6b762e6f1d9d591eb2e8"}, + {file = "protobuf-4.25.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f1279ab38ecbfae7e456a108c5c0681e4956d5b1090027c1de0f934dfdb4b35c"}, + {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:e7cb0ae90dd83727f0c0718634ed56837bfeeee29a5f82a7514c03ee1364c019"}, + {file = "protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:7c8daa26095f82482307bc717364e7c13f4f1c99659be82890dcfc215194554d"}, + {file = "protobuf-4.25.3-cp38-cp38-win32.whl", hash = "sha256:f4f118245c4a087776e0a8408be33cf09f6c547442c00395fbfb116fac2f8ac2"}, + {file = "protobuf-4.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:c053062984e61144385022e53678fbded7aea14ebb3e0305ae3592fb219ccfa4"}, + {file = "protobuf-4.25.3-cp39-cp39-win32.whl", hash = "sha256:19b270aeaa0099f16d3ca02628546b8baefe2955bbe23224aaf856134eccf1e4"}, + {file = "protobuf-4.25.3-cp39-cp39-win_amd64.whl", hash = "sha256:e3c97a1555fd6388f857770ff8b9703083de6bf1f9274a002a332d65fbb56c8c"}, + {file = "protobuf-4.25.3-py3-none-any.whl", hash = "sha256:f0700d54bcf45424477e46a9f0944155b46fb0639d69728739c0e47bab83f2b9"}, + {file = "protobuf-4.25.3.tar.gz", hash = "sha256:25b5d0b42fd000320bd7830b349e3b696435f3b329810427a6bcce6a5492cc5c"}, +] + [[package]] name = "pyarrow" version = "16.0.0" @@ -1763,6 +2487,31 @@ files = [ [package.dependencies] numpy = ">=1.16.6" +[[package]] +name = "pyasn1" +version = "0.6.0" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1-0.6.0-py2.py3-none-any.whl", hash = "sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473"}, + {file = "pyasn1-0.6.0.tar.gz", hash = 
"sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c"}, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.0" +description = "A collection of ASN.1-based protocols modules" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyasn1_modules-0.4.0-py3-none-any.whl", hash = "sha256:be04f15b66c206eed667e0bb5ab27e2b1855ea54a842e5037738099e8ca4ae0b"}, + {file = "pyasn1_modules-0.4.0.tar.gz", hash = "sha256:831dbcea1b177b28c9baddf4c6d1013c24c3accd14a1873fffaa6a2e905f17b6"}, +] + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.7.0" + [[package]] name = "pygments" version = "2.18.0" @@ -2196,6 +2945,24 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +description = "OAuthlib authentication support for Requests." +optional = false +python-versions = ">=3.4" +files = [ + {file = "requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9"}, + {file = "requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36"}, +] + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + [[package]] name = "rra-tools" version = "1.0.10" @@ -2216,6 +2983,20 @@ pathos = ">=0.3.2,<0.4.0" requests = ">=2.32.2,<3.0.0" tqdm = ">=4.66.4,<5.0.0" +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] + +[package.dependencies] +pyasn1 = ">=0.1.3" + [[package]] name = "ruff" version = "0.4.3" @@ -2680,7 +3461,131 @@ io = ["cftime", "fsspec", "h5netcdf", "netCDF4", "pooch", "pydap", "scipy", "zar parallel = ["dask[complete]"] viz = ["matplotlib", "nc-time-axis", "seaborn"] +[[package]] +name = "yarl" +version = "1.9.4" +description = "Yet another URL library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, + {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, + {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, + {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, + {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, + {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, + {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, + {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = 
"sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, + {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, + {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, + {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, + {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, + {file = 
"yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, + {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, + {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, + {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, + {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + +[[package]] +name = "zarr" +version = "2.18.2" +description = "An implementation of chunked, compressed, N-dimensional arrays for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "zarr-2.18.2-py3-none-any.whl", hash = "sha256:a638754902f97efa99b406083fdc807a0e2ccf12a949117389d2a4ba9b05df38"}, + {file = "zarr-2.18.2.tar.gz", hash = "sha256:9bb393b8a0a38fb121dbb913b047d75db28de9890f6d644a217a73cf4ae74f47"}, +] + +[package.dependencies] +asciitree = "*" +fasteners = {version = "*", markers = "sys_platform != \"emscripten\""} +numcodecs = ">=0.10.0" +numpy = ">=1.23" + +[package.extras] +docs = ["numcodecs[msgpack]", "numpydoc", "pydata-sphinx-theme", "sphinx", "sphinx-automodapi", "sphinx-copybutton", "sphinx-design", "sphinx-issues"] +jupyter = ["ipytree (>=0.2.2)", "ipywidgets (>=8.0.0)", "notebook"] + [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "da6f45d547ceb2940cf87d9792ce11d7115e9b11a405ab3420dce9850d2a092f" +content-hash = "ec02c1bbf263c411dd7207aafb53a3c6e9c9fb95297c29064018296fc916d3aa" diff --git a/pyproject.toml b/pyproject.toml index 017e751..19c3068 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,8 @@ netcdf4 = "^1.6.5" pyarrow = "^16.0.0" types-requests = "^2.31.0.20240406" types-tqdm = "^4.66.0.20240417" +gcsfs = "^2024.6.0" +zarr = "^2.18.2" [tool.poetry.group.dev.dependencies] mkdocstrings = {version = ">=0.23", extras = ["python"]} From 5db69f3fb53a7e121e1fad9ea10aeb00f77c4430 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 13:40:33 -0700 Subject: [PATCH 26/71] CMIP6 extraction --- poetry.lock | 13 +- pyproject.toml | 3 + src/climate_downscale/cli_options.py | 87 +++++++++++- src/climate_downscale/data.py | 13 ++ 
src/climate_downscale/extract/__init__.py | 6 + src/climate_downscale/extract/cmip.py | 72 ---------- src/climate_downscale/extract/cmip6.py | 131 ++++++++++++++++++ src/climate_downscale/extract/era5.py | 90 ++++++------ .../old_climate/project_anomaly.py | 38 +---- 9 files changed, 294 insertions(+), 159 deletions(-) delete mode 100644 src/climate_downscale/extract/cmip.py create mode 100644 src/climate_downscale/extract/cmip6.py diff --git a/poetry.lock b/poetry.lock index e911760..daae923 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3298,6 +3298,17 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "types-pyyaml" +version = "6.0.12.20240311" +description = "Typing stubs for PyYAML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-PyYAML-6.0.12.20240311.tar.gz", hash = "sha256:a9e0f0f88dc835739b0c1ca51ee90d04ca2a897a71af79de9aec5f38cb0a5342"}, + {file = "types_PyYAML-6.0.12.20240311-py3-none-any.whl", hash = "sha256:b845b06a1c7e54b8e5b4c683043de0d9caf205e7434b3edc678ff2411979b8f6"}, +] + [[package]] name = "types-requests" version = "2.31.0.20240406" @@ -3588,4 +3599,4 @@ jupyter = ["ipytree (>=0.2.2)", "ipywidgets (>=8.0.0)", "notebook"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "ec02c1bbf263c411dd7207aafb53a3c6e9c9fb95297c29064018296fc916d3aa" +content-hash = "d956b3098dcb83693feb9ac5cb4b39749dbd7ef6e90a8e2bd878ee7c3dc13f43" diff --git a/pyproject.toml b/pyproject.toml index 19c3068..b19f494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ types-requests = "^2.31.0.20240406" types-tqdm = "^4.66.0.20240417" gcsfs = "^2024.6.0" zarr = "^2.18.2" +types-pyyaml = "^6.0.12.20240311" [tool.poetry.group.dev.dependencies] mkdocstrings = {version = ">=0.23", extras = ["python"]} @@ -95,6 +96,7 @@ ignore = [ "E501", # Line too long, this is autoformatted "PYI041", # Use float instead of int | float; dumb rule "T201", # print is fine for now. 
+ "RET504", # Unnecessary assignment before return ] [tool.ruff.lint.per-file-ignores] @@ -148,6 +150,7 @@ exclude = [ module = [ "cdsapi.*", "affine.*", + "gcsfs.*", ] ignore_missing_imports = true diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index 8bcacfd..720b53e 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -49,7 +49,7 @@ def with_month( ) -VALID_CLIMATE_VARIABLES = [ +VALID_ERA5_VARIABLES = [ "10m_u_component_of_wind", "10m_v_component_of_wind", "2m_dewpoint_temperature", @@ -64,15 +64,15 @@ def with_month( ] -def with_climate_variable( +def with_era5_variable( *, allow_all: bool = False, ) -> ClickOption[_P, _T]: return with_choice( - "climate-variable", + "era5-variable", "x", allow_all=allow_all, - choices=VALID_CLIMATE_VARIABLES, + choices=VALID_ERA5_VARIABLES, help="Variable to extract.", ) @@ -93,6 +93,75 @@ def with_era5_dataset( ) +VALID_CMIP6_SOURCES = [ + "CAMS-CSM1-0", + "CanESM5", + "CNRM-ESM2-1", + "GFDL-ESM4", + "GISS-E2-1-G", + "MIROC-ES2L", + "MIROC6", + "MRI-ESM2-0", +] + + +def with_cmip6_source( + *, + allow_all: bool = False, +) -> ClickOption[_P, _T]: + return with_choice( + "cmip6-source", + "s", + allow_all=allow_all, + choices=VALID_CMIP6_SOURCES, + help="CMIP6 source to extract.", + ) + + +VALID_CMIP6_EXPERIMENTS = [ + "ssp119", + "ssp126", + "ssp245", + "ssp370", + "ssp585", +] + + +def with_cmip6_experiment( + *, + allow_all: bool = False, +) -> ClickOption[_P, _T]: + return with_choice( + "cmip6-experiment", + "e", + allow_all=allow_all, + choices=VALID_CMIP6_EXPERIMENTS, + help="CMIP6 experiment to extract.", + ) + + +VALID_CMIP6_VARIABLES = [ + "uas", + "vas", + "hurs", + "tas", + "pr", +] + + +def with_cmip6_variable( + *, + allow_all: bool = False, +) -> ClickOption[_P, _T]: + return with_choice( + "cmip6-variable", + "x", + allow_all=allow_all, + choices=VALID_CMIP6_VARIABLES, + help="CMIP6 variable to extract.", + ) + + STRIDE = 30 LATITUDES = [str(lat) for lat in range(-90, 90, STRIDE)] LONGITUDES = [str(lon) for lon in range(-180, 180, STRIDE)] @@ -125,15 +194,21 @@ def with_lon_start( __all__ = [ "VALID_YEARS", "VALID_MONTHS", - "VALID_CLIMATE_VARIABLES", + "VALID_ERA5_VARIABLES", "VALID_ERA5_DATASETS", + "VALID_CMIP6_SOURCES", + "VALID_CMIP6_EXPERIMENTS", + "VALID_CMIP6_VARIABLES", "STRIDE", "LATITUDES", "LONGITUDES", "with_year", "with_month", - "with_climate_variable", + "with_era5_variable", "with_era5_dataset", + "with_cmip6_source", + "with_cmip6_experiment", + "with_cmip6_variable", "with_lat_start", "with_lon_start", "with_output_directory", diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 48ba036..03548c8 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -40,6 +40,19 @@ def load_era5( ) -> xr.Dataset: return xr.open_dataset(self.era5_path(dataset, variable, year, month)) + @property + def cmip6(self) -> Path: + return self.extracted_data / "cmip6" + + def load_cmip6_metadata(self) -> pd.DataFrame: + meta_path = self.cmip6 / "cmip6-metadata.parquet" + if not meta_path.exists(): + external_path = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv" + meta = pd.read_csv(external_path) + touch(meta_path) + meta.to_parquet(meta_path) + return pd.read_parquet(meta_path) + @property def era5_temperature_daily_mean(self) -> Path: return self.extracted_data / "era5_temperature_daily_mean" diff --git a/src/climate_downscale/extract/__init__.py 
b/src/climate_downscale/extract/__init__.py index 364bcf6..3149b40 100644 --- a/src/climate_downscale/extract/__init__.py +++ b/src/climate_downscale/extract/__init__.py @@ -1,3 +1,7 @@ +from climate_downscale.extract.cmip6 import ( + extract_cmip6, + extract_cmip6_task, +) from climate_downscale.extract.elevation import ( extract_elevation, extract_elevation_task, @@ -18,12 +22,14 @@ RUNNERS = { "ncei": extract_ncei_climate_stations, "era5": extract_era5, + "cmip6": extract_cmip6, "lcz": extract_rub_local_climate_zones, "elevation": extract_elevation, } TASK_RUNNERS = { "ncei": extract_ncei_climate_stations_task, + "cmip6": extract_cmip6_task, "era5_download": download_era5_task, "era5_compress": unzip_and_compress_era5_task, "lcz": extract_rub_local_climate_zones, diff --git a/src/climate_downscale/extract/cmip.py b/src/climate_downscale/extract/cmip.py deleted file mode 100644 index eefe266..0000000 --- a/src/climate_downscale/extract/cmip.py +++ /dev/null @@ -1,72 +0,0 @@ -def load_raw_cmip_metadata() -> pd.DataFrame: - """Loads metadata containing information about all CMIP6 models.""" - path = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv" - return pd.read_csv(path) - -meta = load_raw_cmip_metadata() - -keep_sources = [ - 'CAMS-CSM1-0', - 'CanESM5', - 'CNRM-ESM2-1', - 'GFDL-ESM4', - 'GISS-E2-1-G', - 'MIROC-ES2L', - 'MIROC6', - 'MRI-ESM2-0' -] -keep_experiments = [ - 'ssp119', - 'ssp126', - 'ssp245', - 'ssp370', - 'ssp585', -] - -keep_variables = [ - "uas", - "vas", - "hurs", - "tas", - # "rsus", - # "rlus", - "ps", - # "rsds", - # "rlds", - "pr", - # "rsdsdiff", -] - -keep_tables = [ - #"Amon", - "day", -] - - -mask = ( - meta.source_id.isin(keep_sources) - & meta.experiment_id.isin(keep_experiments) - & meta.variable_id.isin(keep_variables) - & meta.table_id.isin(keep_tables) -) - -meta_sub = meta[mask] -meta_sub['dummy'] = "X" - -pvs = ['source_id', 'experiment_id', 'variable_id'] - -meta_sub.groupby(pvs).dummy.apply(lambda s: ",".join(s.unique().tolist())).unstack() - -import gcsfs -def load_cmip_data(zarr_path: str) -> xr.Dataset: - """Loads a CMIP6 dataset from a zarr path.""" - gcs = gcsfs.GCSFileSystem(token="anon") # noqa: S106 - mapper = gcs.get_mapper(zarr_path) - ds = xr.open_zarr(mapper, consolidated=True) - lon = (ds.lon + 180) % 360 - 180 - ds = ds.assign_coords(lon=lon).sortby("lon") - ds = ds.drop_vars( - ["lat_bnds", "lon_bnds", "time_bnds", "height", "time_bounds", "bnds"], - errors="ignore", - ) - return ds # type: ignore[no-any-return] \ No newline at end of file diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py new file mode 100644 index 0000000..e21f373 --- /dev/null +++ b/src/climate_downscale/extract/cmip6.py @@ -0,0 +1,131 @@ +from pathlib import Path + +import click +import gcsfs +import xarray as xr +from rra_tools import jobmon + +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData + +VARIABLE_ENCODINGS = { + "uas": (0.0, 0.01), + "vas": (0.0, 0.01), + "hurs": (0.0, 0.01), + "tas": (273.15, 0.01), + "pr": (0.0, 1e-9), +} + + +def load_cmip_data(zarr_path: str) -> xr.Dataset: + """Loads a CMIP6 dataset from a zarr path.""" + gcs = gcsfs.GCSFileSystem(token="anon") # noqa: S106 + mapper = gcs.get_mapper(zarr_path) + ds = xr.open_zarr(mapper, consolidated=True) + ds = ds.drop_vars( + ["lat_bnds", "lon_bnds", "time_bnds", "height", "time_bounds", "bnds"], + errors="ignore", + ) + return ds # type: ignore[no-any-return] + + 
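+# How the packing in extract_cmip6_main works: to_netcdf stores each variable
+# as int16 using the (add_offset, scale_factor) pairs from VARIABLE_ENCODINGS.
+# Per netCDF conventions, a stored integer k unpacks to
+#
+#     value = add_offset + scale_factor * k
+#
+# so e.g. "tas" (near-surface air temperature, Kelvin) keeps 0.01 K precision
+# over roughly 273.15 +/- 327.67 K, with -32767 reserved as the fill value.
+# A quick sanity check of the representable range (hypothetical snippet for
+# illustration only; not called anywhere in this module):
+#
+#     shift, scale = VARIABLE_ENCODINGS["tas"]
+#     lo, hi = shift + scale * -32766, shift + scale * 32767
+#     assert lo < 180.0 and hi > 340.0  # plausible surface temps fit easily
+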
+def extract_cmip6_main( + output_dir: str | Path, + cmip6_source: str, + cmip6_experiment: str, + cmip6_variable: str, +) -> None: + cd_data = ClimateDownscaleData(output_dir) + meta = cd_data.load_cmip6_metadata() + + mask = ( + (meta.source_id == cmip6_source) + & (meta.experiment_id == cmip6_experiment) + & (meta.variable_id == cmip6_variable) + & (meta.table_id == "day") + ) + + meta_subset = meta[mask].set_index("member_id").zstore.to_dict() + + for member, zstore_path in meta_subset.items(): + cmip_data = load_cmip_data(zstore_path) + out_filename = f"{cmip6_source}_{cmip6_experiment}_{cmip6_variable}_{member}.nc" + out_path = cd_data.cmip6 / out_filename + shift, scale = VARIABLE_ENCODINGS[cmip6_variable] + cmip_data.to_netcdf( + out_path, + encoding={ + cmip6_variable: { + "dtype": "int16", + "scale_factor": scale, + "add_offset": shift, + "_FillValue": -32767, + "zlib": True, + "complevel": 1, + } + }, + ) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_cmip6_source() +@clio.with_cmip6_experiment() +@clio.with_cmip6_variable() +def extract_cmip6_task( + output_dir: str, + cmip6_source: str, + cmip6_experiment: str, + cmip6_variable: str, +) -> None: + extract_cmip6_main(output_dir, cmip6_source, cmip6_experiment, cmip6_variable) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_cmip6_source(allow_all=True) +@clio.with_cmip6_experiment(allow_all=True) +@clio.with_cmip6_variable(allow_all=True) +@clio.with_queue() +def extract_cmip6( + output_dir: str, + cmip6_source: str, + cmip6_experiment: str, + cmip6_variable: str, + queue: str, +) -> None: + sources = ( + clio.VALID_CMIP6_SOURCES if cmip6_source == clio.RUN_ALL else [cmip6_source] + ) + experiments = ( + clio.VALID_CMIP6_EXPERIMENTS + if cmip6_experiment == clio.RUN_ALL + else [cmip6_experiment] + ) + variables = ( + clio.VALID_CMIP6_VARIABLES + if cmip6_variable == clio.RUN_ALL + else [cmip6_variable] + ) + + jobmon.run_parallel( + runner="cdtask", + task_name="extract_cmip6", + node_args={ + "cmip6-source": sources, + "cmip6-experiment": experiments, + "cmip6-variable": variables, + }, + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "10G", + "runtime": "120m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + concurrency_limit=50, + ) diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 83c4108..058c7d8 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -5,14 +5,13 @@ import cdsapi import click import xarray as xr +import yaml from rra_tools import jobmon from rra_tools.shell_tools import touch from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData -import yaml - def get_download_spec( final_out_path: Path, @@ -29,14 +28,14 @@ def get_download_spec( def download_era5_main( output_dir: str | Path, era5_dataset: str, - climate_variable: str, + era5_variable: str, year: int | str, month: str, user: str, ) -> None: cddata = ClimateDownscaleData(output_dir) - final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + final_out_path = cddata.era5_path(era5_dataset, era5_variable, year, month) download_path, download_format = get_download_spec(final_out_path) if download_path.exists(): @@ -50,14 +49,14 @@ def download_era5_main( cred_path = cddata.credentials_root / 
"copernicus.yaml" credentials = yaml.safe_load(cred_path.read_text()) - url = credentials['url'] - key = credentials['keys'][user] + url = credentials["url"] + key = credentials["keys"][user] copernicus = cdsapi.Client(url=url, key=key) print("Downloading...") kwargs = { "product_type": "reanalysis", - "variable": climate_variable, + "variable": era5_variable, "year": year, "month": month, "day": [f"{d:02d}" for d in range(1, 32)], @@ -71,7 +70,7 @@ def download_era5_main( ) result.download(download_path) except Exception as e: - print(f"Failed to download {era5_dataset} {climate_variable} {year} {month}") + print(f"Failed to download {era5_dataset} {era5_variable} {year} {month}") if download_path.exists(): download_path.unlink() raise e # noqa: TRY201 @@ -80,18 +79,18 @@ def download_era5_main( def unzip_and_compress_era5( output_dir: str | Path, era5_dataset: str, - climate_variable: str, + era5_variable: str, year: int | str, month: str, ) -> None: cddata = ClimateDownscaleData(output_dir) - final_out_path = cddata.era5_path(era5_dataset, climate_variable, year, month) + final_out_path = cddata.era5_path(era5_dataset, era5_variable, year, month) zip_path = final_out_path.with_suffix(".zip") uncompressed_path = final_out_path.with_stem(f"{final_out_path.stem}_raw") - + if era5_dataset == "reanalysis-era5-land": print("Unzipping...") - # This data needs to be unzipped first. + # This data needs to be unzipped first. if uncompressed_path.exists(): uncompressed_path.unlink() touch(uncompressed_path) @@ -102,15 +101,14 @@ def unzip_and_compress_era5( # Download failed or was interrupted, delete the zipfile zip_path.unlink() raise e - + with zipfile.ZipFile(zip_path) as zf: zinfo = zf.infolist() if len(zinfo) != 1: msg = f"Expected a single file in {zip_path}" raise ValueError(msg) - with uncompressed_path.open('wb') as f: + with uncompressed_path.open("wb") as f: f.write(zf.read(zinfo[0])) - print("Compressing") touch(final_out_path) @@ -135,17 +133,17 @@ def unzip_and_compress_era5( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_era5_dataset() -@clio.with_climate_variable() +@clio.with_era5_variable() @clio.with_year() @clio.with_month() @click.option( - "--user", + "--user", type=str, ) def download_era5_task( output_dir: str, era5_dataset: str, - climate_variable: str, + era5_variable: str, year: str, month: str, user: str, @@ -153,7 +151,7 @@ def download_era5_task( download_era5_main( output_dir, era5_dataset, - climate_variable, + era5_variable, year, month, user, @@ -163,20 +161,20 @@ def download_era5_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_era5_dataset() -@clio.with_climate_variable() +@clio.with_era5_variable() @clio.with_year() @clio.with_month() def unzip_and_compress_era5_task( output_dir: str, era5_dataset: str, - climate_variable: str, + era5_variable: str, year: str, month: str, ) -> None: unzip_and_compress_era5( output_dir, era5_dataset, - climate_variable, + era5_variable, year, month, ) @@ -185,14 +183,14 @@ def unzip_and_compress_era5_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_era5_dataset(allow_all=True) -@clio.with_climate_variable(allow_all=True) +@clio.with_era5_variable(allow_all=True) @clio.with_year(allow_all=True) @clio.with_month(allow_all=True) @clio.with_queue() def extract_era5( # noqa: PLR0913 output_dir: str, era5_dataset: str, - climate_variable: str, + era5_variable: str, year: str, month: 
str, queue: str, @@ -200,16 +198,14 @@ def extract_era5( # noqa: PLR0913 cddata = ClimateDownscaleData(output_dir) cred_path = cddata.credentials_root / "copernicus.yaml" credentials = yaml.safe_load(cred_path.read_text()) - users = list(credentials['keys']) + users = list(credentials["keys"]) jobs_per_user = 20 - + datasets = ( clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] ) variables = ( - clio.VALID_CLIMATE_VARIABLES - if climate_variable == clio.RUN_ALL - else [climate_variable] + clio.VALID_ERA5_VARIABLES if era5_variable == clio.RUN_ALL else [era5_variable] ) years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] months = clio.VALID_MONTHS if month == clio.RUN_ALL else [month] @@ -217,11 +213,9 @@ def extract_era5( # noqa: PLR0913 to_download = [] to_compress = [] complete = [] - for spec in itertools.product( - datasets, variables, years, months - ): + for spec in itertools.product(datasets, variables, years, months): final_out_path = cddata.era5_path(*spec) - download_path, _ = get_download_spec(final_out_path) + download_path, _ = get_download_spec(final_out_path) if final_out_path.exists() and download_path.exists(): # We broke in the middle of processing this file. Don't re-download, @@ -234,10 +228,10 @@ def extract_era5( # noqa: PLR0913 to_download.append(spec) to_compress.append(spec) elif download_path.exists() and download_path.stat().st_size == 0: - # We broke while downloading. Assume this file is invalid and re-download + # We broke while downloading. Assume this file is invalid and re-download download_path.unlink() to_download.append(spec) - to_compress.append(spec) + to_compress.append(spec) elif download_path.exists(): to_compress.append(spec) elif final_out_path.exists(): @@ -250,24 +244,28 @@ def extract_era5( # noqa: PLR0913 while to_download: downloads_left = len(to_download) - - + download_batch = [] - for i in range(jobs_per_user): + for _ in range(jobs_per_user): for user in users: if to_download: - download_batch.append( - (*to_download.pop(), user) - ) - assert len(download_batch) == min(len(users) * jobs_per_user, downloads_left) - - print(len(to_download) + len(download_batch), "remaining. Launching next", len(download_batch), "jobs") + download_batch.append((*to_download.pop(), user)) + if not len(download_batch) == min(len(users) * jobs_per_user, downloads_left): + msg = "Download batch size is incorrect" + raise ValueError(msg) + + print( + len(to_download) + len(download_batch), + "remaining. 
Launching next", + len(download_batch), + "jobs", + ) jobmon.run_parallel( runner="cdtask", task_name="extract era5_download", flat_node_args=( - ("era5-dataset", "climate-variable", "year", "month", "user"), + ("era5-dataset", "era5-variable", "year", "month", "user"), download_batch, ), task_args={ @@ -287,7 +285,7 @@ def extract_era5( # noqa: PLR0913 runner="cdtask", task_name="extract era5_compress", flat_node_args=( - ("era5-dataset", "climate-variable", "year", "month"), + ("era5-dataset", "era5-variable", "year", "month"), to_compress, ), task_args={ diff --git a/src/climate_downscale/old_climate/project_anomaly.py b/src/climate_downscale/old_climate/project_anomaly.py index ae37b72..c35ef40 100644 --- a/src/climate_downscale/old_climate/project_anomaly.py +++ b/src/climate_downscale/old_climate/project_anomaly.py @@ -5,11 +5,10 @@ import click import pandas as pd -from rra_tools import jobmon - from rra_population_pipelines.pipelines.climate import data from rra_population_pipelines.shared.cli_tools import options as clio from rra_population_pipelines.shared.data import RRA_POP +from rra_tools import jobmon if TYPE_CHECKING: import xarray as xr @@ -26,35 +25,6 @@ _VALID_YEARS = tuple([str(y) for y in range(2015, 2101)]) -def get_run_metadata( - variable_id: str, - experiment_id: str, -) -> pd.DataFrame: - metadata = data.load_cmip_metadata() - metadata = ( - metadata.set_index(["institution_id", "source_id"]) - .sort_index() - .loc[_ENSEMBLE_MEMBERS] - .reset_index() - .set_index(["variable_id", "experiment_id"]) - ) - history_meta = ( - metadata.loc[(variable_id, "historical")] - .set_index(["institution_id", "source_id", "member_id"]) # type: ignore[union-attr] - .loc[:, "zstore"] - ) - experiment_meta = ( - metadata.loc[(variable_id, experiment_id)] - .set_index(["institution_id", "source_id", "member_id"]) # type: ignore[union-attr] - .loc[:, "zstore"] - ) - final_meta = pd.concat( - [history_meta.rename("historical"), experiment_meta.rename("experiment")], - axis=1, - ) - return final_meta # type: ignore[no-any-return] - - def compute_common_lat_lon( run_metadata: pd.DataFrame, ) -> tuple[pd.Index[float], pd.Index[float]]: @@ -62,7 +32,7 @@ def compute_common_lat_lon( lon = pd.Index([], name="lon", dtype=float) for key in run_metadata.index.tolist(): - historical = data.load_cmip_historical_data(run_metadata.at[key, "historical"]) + historical = data.load_cmip_historical_data(run_metadata.loc[key, "historical"]) lat = lat.union(historical["lat"]) # type: ignore[arg-type] lon = lon.union(historical["lon"]) # type: ignore[arg-type] return lat, lon @@ -98,9 +68,9 @@ def project_anomaly_main(variable: str, experiment: str, year: str) -> xr.Datase anomalies: list[xr.Dataset] = [] for key in run_meta.index.tolist(): - historical = data.load_cmip_historical_data(run_meta.at[key, "historical"]) + historical = data.load_cmip_historical_data(run_meta.loc[key, "historical"]) scenario = data.load_cmip_experiment_data( - run_meta.at[key, "experiment"], year=year + run_meta.loc[key, "experiment"], year=year ) anomaly = compute_single_model_anomaly(historical, scenario, variable=variable) anomaly = interp_common_lat_lon(anomaly, lat, lon) From 83d6030e411cf62188bff95b8a44781a62363ca0 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 13:41:24 -0700 Subject: [PATCH 27/71] CMIP6 extraction --- src/climate_downscale/extract/cmip6.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 
e21f373..bae994a 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -3,7 +3,7 @@ import click import gcsfs import xarray as xr -from rra_tools import jobmon +from rra_tools import jobmon, shell_tools from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData @@ -51,6 +51,7 @@ def extract_cmip6_main( cmip_data = load_cmip_data(zstore_path) out_filename = f"{cmip6_source}_{cmip6_experiment}_{cmip6_variable}_{member}.nc" out_path = cd_data.cmip6 / out_filename + shell_tools.touch(out_path, exist_ok=True) shift, scale = VARIABLE_ENCODINGS[cmip6_variable] cmip_data.to_netcdf( out_path, From 1c6d38567937df8933d7640b0c665da5dfe75b4f Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 13:42:29 -0700 Subject: [PATCH 28/71] typo --- src/climate_downscale/extract/cmip6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index bae994a..500bc4f 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -111,7 +111,7 @@ def extract_cmip6( jobmon.run_parallel( runner="cdtask", - task_name="extract_cmip6", + task_name="extract cmip6", node_args={ "cmip6-source": sources, "cmip6-experiment": experiments, From 7ee36b309ac077ec749691618d0dc4ee82ee9950 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 13:44:36 -0700 Subject: [PATCH 29/71] some logging --- src/climate_downscale/extract/cmip6.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 500bc4f..1437984 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -35,6 +35,7 @@ def extract_cmip6_main( cmip6_experiment: str, cmip6_variable: str, ) -> None: + print(f'Checking metadata for {cmip6_source} {cmip6_experiment} {cmip6_variable}') cd_data = ClimateDownscaleData(output_dir) meta = cd_data.load_cmip6_metadata() @@ -46,6 +47,7 @@ def extract_cmip6_main( ) meta_subset = meta[mask].set_index("member_id").zstore.to_dict() + print(f'Extracting {len(meta_subset)} members...') for member, zstore_path in meta_subset.items(): cmip_data = load_cmip_data(zstore_path) From 593a2a7d6699e51458b55e87230f56ffce71f445 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 13:45:27 -0700 Subject: [PATCH 30/71] some logging --- src/climate_downscale/extract/cmip6.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 1437984..5b25e30 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -50,11 +50,13 @@ def extract_cmip6_main( print(f'Extracting {len(meta_subset)} members...') for member, zstore_path in meta_subset.items(): + print('Extracting', member, zstore_path) cmip_data = load_cmip_data(zstore_path) out_filename = f"{cmip6_source}_{cmip6_experiment}_{cmip6_variable}_{member}.nc" out_path = cd_data.cmip6 / out_filename shell_tools.touch(out_path, exist_ok=True) shift, scale = VARIABLE_ENCODINGS[cmip6_variable] + print('Writing to', out_path) cmip_data.to_netcdf( out_path, encoding={ From 0de6d2fce871d195cf2039d23ea7a2bd8e493379 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 12 Jun 2024 13:47:30 -0700 Subject: [PATCH 31/71] Change naming scheme --- src/climate_downscale/extract/cmip6.py | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 5b25e30..59c6c96 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -35,7 +35,7 @@ def extract_cmip6_main( cmip6_experiment: str, cmip6_variable: str, ) -> None: - print(f'Checking metadata for {cmip6_source} {cmip6_experiment} {cmip6_variable}') + print(f"Checking metadata for {cmip6_source} {cmip6_experiment} {cmip6_variable}") cd_data = ClimateDownscaleData(output_dir) meta = cd_data.load_cmip6_metadata() @@ -47,16 +47,16 @@ def extract_cmip6_main( ) meta_subset = meta[mask].set_index("member_id").zstore.to_dict() - print(f'Extracting {len(meta_subset)} members...') + print(f"Extracting {len(meta_subset)} members...") for member, zstore_path in meta_subset.items(): - print('Extracting', member, zstore_path) + print("Extracting", member, zstore_path) cmip_data = load_cmip_data(zstore_path) - out_filename = f"{cmip6_source}_{cmip6_experiment}_{cmip6_variable}_{member}.nc" + out_filename = f"{cmip6_variable}_{cmip6_experiment}_{cmip6_source}_{member}.nc" out_path = cd_data.cmip6 / out_filename shell_tools.touch(out_path, exist_ok=True) shift, scale = VARIABLE_ENCODINGS[cmip6_variable] - print('Writing to', out_path) + print("Writing to", out_path) cmip_data.to_netcdf( out_path, encoding={ From 1e8da840a39a7d53470f3bc0d775948ddbcc93d9 Mon Sep 17 00:00:00 2001 From: James Collins Date: Thu, 13 Jun 2024 15:15:48 -0700 Subject: [PATCH 32/71] Update runtime --- src/climate_downscale/extract/cmip6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 59c6c96..eeb8863 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -128,7 +128,7 @@ def extract_cmip6( "queue": queue, "cores": 1, "memory": "10G", - "runtime": "120m", + "runtime": "600m", "project": "proj_rapidresponse", }, max_attempts=1, From b84ef0dd43ab958c382f49d057ed1793761b3608 Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 13 Jun 2024 18:10:12 -0700 Subject: [PATCH 33/71] Do some reorg --- .../{model => downscale}/__init__.py | 4 +- .../prepare_predictors.py | 2 +- .../prepare_training_data.py | 4 +- src/climate_downscale/extract/elevation.py | 2 +- src/climate_downscale/generate/__init__.py | 0 .../era5_daily.py} | 127 ++++++++++-------- src/climate_downscale/generate/utils.py | 0 7 files changed, 80 insertions(+), 59 deletions(-) rename src/climate_downscale/{model => downscale}/__init__.py (74%) rename src/climate_downscale/{model => downscale}/prepare_predictors.py (98%) rename src/climate_downscale/{model => downscale}/prepare_training_data.py (97%) create mode 100644 src/climate_downscale/generate/__init__.py rename src/climate_downscale/{model/prepare_era5_daily.py => generate/era5_daily.py} (74%) create mode 100644 src/climate_downscale/generate/utils.py diff --git a/src/climate_downscale/model/__init__.py b/src/climate_downscale/downscale/__init__.py similarity index 74% rename from src/climate_downscale/model/__init__.py rename to src/climate_downscale/downscale/__init__.py index 5449577..24280ec 100644 --- a/src/climate_downscale/model/__init__.py +++ b/src/climate_downscale/downscale/__init__.py @@ -1,8 +1,8 @@ -from climate_downscale.model.prepare_predictors import ( +from climate_downscale.downscale.prepare_predictors import ( prepare_predictors, prepare_predictors_task, ) -from 
climate_downscale.model.prepare_training_data import ( +from climate_downscale.downscale.prepare_training_data import ( prepare_training_data, prepare_training_data_task, ) diff --git a/src/climate_downscale/model/prepare_predictors.py b/src/climate_downscale/downscale/prepare_predictors.py similarity index 98% rename from src/climate_downscale/model/prepare_predictors.py rename to src/climate_downscale/downscale/prepare_predictors.py index c858c90..e958bf8 100644 --- a/src/climate_downscale/model/prepare_predictors.py +++ b/src/climate_downscale/downscale/prepare_predictors.py @@ -124,7 +124,7 @@ def prepare_predictors_task( def prepare_predictors(output_dir: str, queue: str) -> None: jobmon.run_parallel( runner="cdtask", - task_name="model prepare_predictors", + task_name="downscale prepare_predictors", node_args={ "lat-start": clio.LATITUDES, "lon-start": clio.LONGITUDES, diff --git a/src/climate_downscale/model/prepare_training_data.py b/src/climate_downscale/downscale/prepare_training_data.py similarity index 97% rename from src/climate_downscale/model/prepare_training_data.py rename to src/climate_downscale/downscale/prepare_training_data.py index 755ea08..807b0c3 100644 --- a/src/climate_downscale/model/prepare_training_data.py +++ b/src/climate_downscale/downscale/prepare_training_data.py @@ -36,7 +36,7 @@ def load_and_clean_climate_stations( temperature=lambda df: 5 / 9 * (df["temperature"] - 32), ) ) - return climate_stations # noqa: RET504 + return climate_stations def get_era5_temperature( @@ -113,7 +113,7 @@ def prepare_training_data_task(output_dir: str, year: str) -> None: def prepare_training_data(output_dir: str, queue: str) -> None: jobmon.run_parallel( runner="cdtask", - task_name="model prepare_training_data", + task_name="downscale prepare_training_data", node_args={ "year": clio.VALID_YEARS, }, diff --git a/src/climate_downscale/extract/elevation.py b/src/climate_downscale/extract/elevation.py index c12b294..1f66d82 100644 --- a/src/climate_downscale/extract/elevation.py +++ b/src/climate_downscale/extract/elevation.py @@ -94,7 +94,7 @@ def extract_elevation_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @click.option( - "--model-name", + "--generate-name", required=True, type=click.Choice(ELEVATION_MODELS), help="Name of the elevation model to download.", diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/climate_downscale/model/prepare_era5_daily.py b/src/climate_downscale/generate/era5_daily.py similarity index 74% rename from src/climate_downscale/model/prepare_era5_daily.py rename to src/climate_downscale/generate/era5_daily.py index a33b4fc..a5f34e0 100644 --- a/src/climate_downscale/model/prepare_era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -1,18 +1,24 @@ -import pandas as pd -import xarray as xr from pathlib import Path -import numpy as np +import numpy as np +import pandas as pd +import xarray as xr -TARGET_LON = xr.DataArray(np.round(np.arange(0., 360., 0.1, dtype='float32'), 1), dims='longitude') -TARGET_LAT = xr.DataArray(np.round(np.arange(90., -90.1, -0.1, dtype='float32'), 1), dims='latitude') +TARGET_LON = xr.DataArray( + np.round(np.arange(0.0, 360.0, 0.1, dtype="float32"), 1), dims="longitude" +) +TARGET_LAT = xr.DataArray( + np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" +) def kelvin_to_celsius(temperature_k): return temperature_k - 273.15 + def 
m_to_mm(ds): - return 1000*ds + return 1000 * ds + def scale_windspeed(windspeed): """Scaling wind speed from a height of 10 meters to a height of 2 meters @@ -33,19 +39,21 @@ def scale_windspeed(windspeed): scale_factor = np.log10(2 / 0.01) / np.log10(10 / 0.01) return scale_factor * windspeed + def identity(ds): return ds + def rename_val_column(ds): data_var = next(iter(ds)) return ds.rename({data_var: "value"}) - + convert_map = { "10m_u_component_of_wind": scale_windspeed, "10m_v_component_of_wind": scale_windspeed, "2m_dewpoint_temperature": kelvin_to_celsius, - "2m_temperature": kelvin_to_celsius, + "2m_temperature": kelvin_to_celsius, "surface_net_solar_radiation": identity, "surface_net_thermal_radiation": identity, "surface_pressure": identity, @@ -55,49 +63,56 @@ def rename_val_column(ds): "total_sky_direct_solar_radiation_at_surface": identity, } + def interpolate_to_target(ds): - return ( - ds - .interp(longitude=TARGET_LON, latitude=TARGET_LAT, method='nearest') - .interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") - ) + return ds.interp( + longitude=TARGET_LON, latitude=TARGET_LAT, method="nearest" + ).interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") + -def load_variable(variable, year, month, dataset='single-levels'): +def load_variable(variable, year, month, dataset="single-levels"): root = Path("/mnt/share/erf/climate_downscale/extracted_data/era5") p = root / f"reanalysis-era5-{dataset}_{variable}_{year}_{month}.nc" - if dataset == 'land' and not p.exists(): + if dataset == "land" and not p.exists(): # Substitute the single level dataset pre-interpolated at the target resolution. p = root / f"reanalysis-era5-single-levels_{source_variable}_{year}_{month}.nc" ds = interpolate_to_target(xr.load_dataset(p)) - elif dataset == 'land': + elif dataset == "land": ds = xr.load_dataset(p).assign_coords(latitude=TARGET_LAT, longitude=TARGET_LON) else: ds = xr.load_dataset(p) conversion = convert_map[variable] - ds = conversion(rename_val_column(ds)) + ds = conversion(rename_val_column(ds)) return ds - + ######## - + + def daily_mean(ds): - return ds.groupby('time.date').mean() + return ds.groupby("time.date").mean() + def daily_max(ds): - return ds.groupby('time.date').max() + return ds.groupby("time.date").max() + def daily_min(ds): - return ds.groupby('time.date').min() + return ds.groupby("time.date").min() + def daily_sum(ds): - return ds.groupby('time.date').sum() + return ds.groupby("time.date").sum() + def cdd(temperature_c): return np.maximum(temperature_c - 18, 0).groupby("time.date").mean() + def hdd(temperature_c): return np.maximum(18 - temperature_c, 0).groupby("time.date").mean() + def vector_magnitude(x, y): return np.sqrt(x**2 + y**2) @@ -108,10 +123,15 @@ def buck_vapor_presure(temperature_c): https://en.wikipedia.org/wiki/Arden_Buck_equation https://journals.ametsoc.org/view/journals/apme/20/12/1520-0450_1981_020_1527_nefcvp_2_0_co_2.xml """ - over_water = 6.1121 * np.exp((18.678 - temperature_c / 234.5) * (temperature_c / (257.14 + temperature_c))) - over_ice = 6.1115 * np.exp((23.036 - temperature_c / 333.7) * (temperature_c / (279.82 + temperature_c))) + over_water = 6.1121 * np.exp( + (18.678 - temperature_c / 234.5) * (temperature_c / (257.14 + temperature_c)) + ) + over_ice = 6.1115 * np.exp( + (23.036 - temperature_c / 333.7) * (temperature_c / (279.82 + temperature_c)) + ) return xr.where(temperature_c > 0, over_water, over_ice) + def rh_percent(temperature_c, dewpoint_temperature_c): # saturated vapour 
pressure es = buck_vapor_pressure(temperature_c) @@ -120,10 +140,11 @@ def rh_percent(temperature_c, dewpoint_temperature_c): rh = (e / es) * 100 return rh + def heat_index(temperature_c, dewpoint_temperature_c): t = temperature_c # Alias for simplicity in the formula r = rh_percent(temperature_c, dewpoint_temperature_c) - + hi_raw = ( -8.784695 + 1.61139411 * t @@ -138,15 +159,17 @@ def heat_index(temperature_c, dewpoint_temperature_c): hi = xr.where(t > 20, hi_raw, t) return hi + def humidex(temperature_c, dewpoint_temperature_c): vp = buck_vapor_pressure(dewpoint_temperature_c) return temperature_c + 0.5555 * (vp - 10) + def effective_temperature(temperature_c, dewpoint_temperature_c, uas, vas): """https://www.sciencedirect.com/topics/engineering/effective-temperature""" t = temperature_c r = rh_percent(temperature_c, dewpoint_temperature_c) - v = vector_magnitude(uas, vas) + v = vector_magnitude(uas, vas) wind_adjustment = 1 / (1.76 + 1.4 * v**0.75) et = ( @@ -155,22 +178,23 @@ def effective_temperature(temperature_c, dewpoint_temperature_c, uas, vas): - 0.29 * t * (1 - 0.01 * r) ) return et - - - collapse_map = { "mean_temperature": (["2m_temperature"], daily_mean, (273.15, 0.01)), "max_temperature": (["2m_temperature"], daily_max, (273.15, 0.01)), - "min_temperature": (["2m_temperature"], daily_min, (273.15, 0.01)), + "min_temperature": (["2m_temperature"], daily_min, (273.15, 0.01)), "cooling_degree_days": (["2m_temperature"], cdd, (0, 0.01)), "heating_degree_days": (["2m_temperature"], hdd, (0, 0.01)), "wind_speed": ( - ["10m_u_component_of_wind", "10m_v_component_of_wind"], lambda x, y: daily_mean(vector_magnitude(x, y)), (0, 0.01) + ["10m_u_component_of_wind", "10m_v_component_of_wind"], + lambda x, y: daily_mean(vector_magnitude(x, y)), + (0, 0.01), ), "relative_humidity": ( - ["2m_temperature", "2m_dewpoint_temperature"], lambda x, y: daily_mean(rh_percent(x, y)), (0, 0.01) + ["2m_temperature", "2m_dewpoint_temperature"], + lambda x, y: daily_mean(rh_percent(x, y)), + (0, 0.01), ), "total_precipitation": (["total_precipitation"], daily_sum, (0, 0.1)), # "heat_index": ( @@ -183,7 +207,6 @@ def effective_temperature(temperature_c, dewpoint_temperature_c, uas, vas): # ["2m_temperature", "2m_dewpoint_temperature", "10m_u_component_of_wind", "10m_v_component_of_wind"], # lambda *args: daily_mean(effective_temperature(*args)), (273.15, 0.01) # ), - } year = "1990" @@ -194,36 +217,34 @@ def effective_temperature(temperature_c, dewpoint_temperature_c, uas, vas): print("loading single-levels") single_level = [ - load_variable(sv, year, month, 'single-levels') for sv in source_variables + load_variable(sv, year, month, "single-levels") for sv in source_variables ] -print('collapsing') +print("collapsing") ds = collapse_fun(*single_level) ds = ds.assign(date=pd.to_datetime(ds.date)) -print('interpolating') +print("interpolating") ds_land_res = interpolate_to_target(ds) print("loading land") -land = [ - load_variable(sv, year, month, 'land') for sv in source_variables -] -print('collapsing') +land = [load_variable(sv, year, month, "land") for sv in source_variables] +print("collapsing") ds_land = collapse_fun(*land) ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) -print('combining') +print("combining") combined = ds_land.combine_first(ds_land_res) combined.to_netcdf( - 'compressed.nc', + "compressed.nc", encoding={ - 'value': { - 'dtype': 'int16', - 'add_offset': e_offset, - 'scale_factor': e_scale, - '_FillValue': -9999, - 'zlib': True, - 'complevel': 1, - } - } -) \ No 
newline at end of file + "value": { + "dtype": "int16", + "add_offset": e_offset, + "scale_factor": e_scale, + "_FillValue": -9999, + "zlib": True, + "complevel": 1, + } + }, +) diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py new file mode 100644 index 0000000..e69de29 From c2a4520d83f2c3a41bbec2e1ff17e449799043bb Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 13 Jun 2024 21:20:18 -0700 Subject: [PATCH 34/71] Put together era5 daily script --- src/climate_downscale/cli.py | 4 +- src/climate_downscale/cli_options.py | 5 +- src/climate_downscale/data.py | 34 +- src/climate_downscale/generate/__init__.py | 12 + src/climate_downscale/generate/era5_daily.py | 426 +++++++++---------- src/climate_downscale/generate/utils.py | 277 ++++++++++++ 6 files changed, 538 insertions(+), 220 deletions(-) diff --git a/src/climate_downscale/cli.py b/src/climate_downscale/cli.py index b962d54..38f86d3 100644 --- a/src/climate_downscale/cli.py +++ b/src/climate_downscale/cli.py @@ -1,6 +1,6 @@ import click -from climate_downscale import extract, model +from climate_downscale import downscale, extract, generate @click.group() @@ -13,7 +13,7 @@ def cdtask() -> None: """Entry point for running climate downscale tasks.""" -for module in [extract, model]: +for module in [extract, downscale, generate]: runners = getattr(module, "RUNNERS", {}) task_runners = getattr(module, "TASK_RUNNERS", {}) diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index 720b53e..879465e 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -22,13 +22,14 @@ def with_year( *, + years: list[str] = VALID_YEARS, allow_all: bool = False, ) -> ClickOption[_P, _T]: return with_choice( "year", "y", allow_all=allow_all, - choices=VALID_YEARS, + choices=years, help="Year to extract data for.", ) @@ -219,4 +220,6 @@ def with_lon_start( "with_num_cores", "with_progress_bar", "RUN_ALL", + "ClickOption", + "with_choice", ] diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 03548c8..839448f 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -87,12 +87,12 @@ def rub_local_climate_zones(self) -> Path: return self.extracted_data / "rub_local_climate_zones" @property - def model(self) -> Path: - return self.root / "model" + def downscale_model(self) -> Path: + return self.root / "downscale_model" @property def predictors(self) -> Path: - return self.model / "predictors" + return self.downscale_model / "predictors" def save_predictor( self, @@ -109,7 +109,7 @@ def load_predictor(self, name: str) -> rt.RasterArray: @property def training_data(self) -> Path: - return self.model / "training_data" + return self.downscale_model / "training_data" def save_training_data(self, df: pd.DataFrame, year: int | str) -> None: path = self.training_data / f"{year}.parquet" @@ -119,6 +119,32 @@ def save_training_data(self, df: pd.DataFrame, year: int | str) -> None: def load_training_data(self, year: int | str) -> pd.DataFrame: return pd.read_parquet(self.training_data / f"{year}.parquet") + @property + def results(self) -> Path: + return self.root / "results" + + @property + def era5_daily(self) -> Path: + return self.results / "era5_daily" + + def save_era5_daily( + self, + ds: xr.Dataset, + variable: str, + year: int | str, + **encoding_kwargs: Any, + ) -> None: + encoding = { + "dtype": "int16", + "_FillValue": -32767, + "zlib": True, + "complevel": 1, + } + 
encoding.update(encoding_kwargs) + path = self.era5_daily / f"{variable}_{year}.nc" + touch(path, exist_ok=True) + ds.to_netcdf(path, encoding={"value": encoding}) + def save_raster( raster: rt.RasterArray, diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index e69de29..21710f6 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -0,0 +1,12 @@ +from climate_downscale.generate.era5_daily import ( + generate_era5_daily, + generate_era5_daily_task, +) + +RUNNERS = { + "era5_daily": generate_era5_daily, +} + +TASK_RUNNERS = { + "era5_daily": generate_era5_daily_task, +} diff --git a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/era5_daily.py index a5f34e0..6a7a45d 100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -1,250 +1,250 @@ +import typing from pathlib import Path +import click import numpy as np import pandas as pd import xarray as xr +from rra_tools import jobmon + +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData +from climate_downscale.generate import utils TARGET_LON = xr.DataArray( - np.round(np.arange(0.0, 360.0, 0.1, dtype="float32"), 1), dims="longitude" + np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" ) TARGET_LAT = xr.DataArray( np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" ) +# Map from source variable to a unit conversion function +CONVERT_MAP = { + "10m_u_component_of_wind": utils.scale_wind_speed_height, + "10m_v_component_of_wind": utils.scale_wind_speed_height, + "2m_dewpoint_temperature": utils.kelvin_to_celsius, + "2m_temperature": utils.kelvin_to_celsius, + "surface_net_solar_radiation": utils.identity, + "surface_net_thermal_radiation": utils.identity, + "surface_pressure": utils.identity, + "surface_solar_radiation_downwards": utils.identity, + "surface_thermal_radiation_downwards": utils.identity, + "total_precipitation": utils.meter_to_millimeter, + "total_sky_direct_solar_radiation_at_surface": utils.identity, +} -def kelvin_to_celsius(temperature_k): - return temperature_k - 273.15 - - -def m_to_mm(ds): - return 1000 * ds +# Map from target variable to: +# - a list of source variables +# - a transformation function +# - a tuple of offset and scale factors for the output for serialization +TRANSFORM_MAP = { + "mean_temperature": ( + ["2m_temperature"], + utils.daily_mean, + (273.15, 0.01), + ), + "max_temperature": ( + ["2m_temperature"], + utils.daily_max, + (273.15, 0.01), + ), + "min_temperature": ( + ["2m_temperature"], + utils.daily_min, + (273.15, 0.01), + ), + "cooling_degree_days": ( + ["2m_temperature"], + utils.cdd, + (0, 0.01), + ), + "heating_degree_days": ( + ["2m_temperature"], + utils.hdd, + (0, 0.01), + ), + "wind_speed": ( + ["10m_u_component_of_wind", "10m_v_component_of_wind"], + lambda x, y: utils.daily_mean(utils.vector_magnitude(x, y)), + (0, 0.01), + ), + "relative_humidity": ( + ["2m_temperature", "2m_dewpoint_temperature"], + lambda x, y: utils.daily_mean(utils.rh_percent(x, y)), + (0, 0.01), + ), + "total_precipitation": ( + ["total_precipitation"], + utils.daily_sum, + (0, 0.1), + ), +} +UNTESTED_TRANSFORM_MAP = { + "heat_index": ( + ["2m_temperature", "2m_dewpoint_temperature"], + lambda x, y: utils.daily_mean(utils.heat_index(x, y)), + (273.15, 0.01), + ), + "humidex": ( + ["2m_temperature", 
"2m_dewpoint_temperature"], + lambda x, y: utils.daily_mean(utils.humidex(x, y)), + (273.15, 0.01), + ), + "effective_temperature": ( + [ + "2m_temperature", + "2m_dewpoint_temperature", + "10m_u_component_of_wind", + "10m_v_component_of_wind", + ], + lambda t2m, t2d, uas, vas: utils.daily_mean( + utils.effective_temperature(t2m, t2d, uas, vas) + ), + (273.15, 0.01), + ), +} -def scale_windspeed(windspeed): - """Scaling wind speed from a height of 10 meters to a height of 2 meters - Reference: Bröde et al. (2012) - https://doi.org/10.1007/s00484-011-0454-1 +_P = typing.ParamSpec("_P") +_T = typing.TypeVar("_T") - Parameters - ---------- - ds - The 10m wind speed [m/s]. May be signed (ie a velocity component) - Returnds - -------- - xr.DataSet - The 2m wind speed [m/s]. May be signed (ie a velocity component) - """ - scale_factor = np.log10(2 / 0.01) / np.log10(10 / 0.01) - return scale_factor * windspeed +def with_variable( + *, + allow_all: bool = False, +) -> clio.ClickOption[_P, _T]: + return clio.with_choice( + "target-variable", + "t", + allow_all=allow_all, + choices=list(TRANSFORM_MAP.keys()), + help="Variable to generate.", + ) -def identity(ds): +def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: + ds = xr.load_dataset(ds_path) + ds = ds.assign_coords(longitude=(ds.longitude + 180) % 360 - 180).sortby( + "longitude" + ) return ds -def rename_val_column(ds): - data_var = next(iter(ds)) - return ds.rename({data_var: "value"}) - - -convert_map = { - "10m_u_component_of_wind": scale_windspeed, - "10m_v_component_of_wind": scale_windspeed, - "2m_dewpoint_temperature": kelvin_to_celsius, - "2m_temperature": kelvin_to_celsius, - "surface_net_solar_radiation": identity, - "surface_net_thermal_radiation": identity, - "surface_pressure": identity, - "surface_solar_radiation_downwards": identity, - "surface_thermal_radiation_downwards": identity, - "total_precipitation": m_to_mm, - "total_sky_direct_solar_radiation_at_surface": identity, -} - - -def interpolate_to_target(ds): - return ds.interp( - longitude=TARGET_LON, latitude=TARGET_LAT, method="nearest" - ).interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") - - -def load_variable(variable, year, month, dataset="single-levels"): +def load_variable( + variable: str, + year: str, + month: str, + dataset: str = "single-levels", +) -> xr.Dataset: root = Path("/mnt/share/erf/climate_downscale/extracted_data/era5") p = root / f"reanalysis-era5-{dataset}_{variable}_{year}_{month}.nc" if dataset == "land" and not p.exists(): # Substitute the single level dataset pre-interpolated at the target resolution. 
- p = root / f"reanalysis-era5-single-levels_{source_variable}_{year}_{month}.nc" - ds = interpolate_to_target(xr.load_dataset(p)) + p = root / f"reanalysis-era5-single-levels_{variable}_{year}_{month}.nc" + ds = utils.interpolate_to_target_latlon( + load_and_shift_longitude(p), + target_lat=TARGET_LAT, + target_lon=TARGET_LON, + ) elif dataset == "land": - ds = xr.load_dataset(p).assign_coords(latitude=TARGET_LAT, longitude=TARGET_LON) + ds = load_and_shift_longitude(p).assign_coords( + latitude=TARGET_LAT, longitude=TARGET_LON + ) else: - ds = xr.load_dataset(p) - conversion = convert_map[variable] - ds = conversion(rename_val_column(ds)) + ds = load_and_shift_longitude(p) + conversion = CONVERT_MAP[variable] + ds = conversion(utils.rename_val_column(ds)) return ds -######## - - -def daily_mean(ds): - return ds.groupby("time.date").mean() - - -def daily_max(ds): - return ds.groupby("time.date").max() - - -def daily_min(ds): - return ds.groupby("time.date").min() - - -def daily_sum(ds): - return ds.groupby("time.date").sum() - - -def cdd(temperature_c): - return np.maximum(temperature_c - 18, 0).groupby("time.date").mean() - - -def hdd(temperature_c): - return np.maximum(18 - temperature_c, 0).groupby("time.date").mean() - - -def vector_magnitude(x, y): - return np.sqrt(x**2 + y**2) - - -def buck_vapor_presure(temperature_c): - """Approximate vapor pressure of water. - - https://en.wikipedia.org/wiki/Arden_Buck_equation - https://journals.ametsoc.org/view/journals/apme/20/12/1520-0450_1981_020_1527_nefcvp_2_0_co_2.xml - """ - over_water = 6.1121 * np.exp( - (18.678 - temperature_c / 234.5) * (temperature_c / (257.14 + temperature_c)) - ) - over_ice = 6.1115 * np.exp( - (23.036 - temperature_c / 333.7) * (temperature_c / (279.82 + temperature_c)) +def generate_era5_daily_main( + output_dir: str | Path, + year: str, + target_variable: str, +) -> None: + source_variables, collapse_fun, (e_offset, e_scale) = TRANSFORM_MAP[target_variable] + + datasets = [] + for month in range(1, 13): + month_str = f"{month:02d}" + print("loading single-levels") + single_level = [ + load_variable(sv, year, month_str, "single-levels") + for sv in source_variables + ] + print("collapsing") + ds = collapse_fun(*single_level) # type: ignore[operator] + # collapsing often screws the date dtype, so fix it + ds = ds.assign(date=pd.to_datetime(ds.date)) + + print("interpolating") + ds_land_res = utils.interpolate_to_target_latlon(ds, TARGET_LAT, TARGET_LON) + + print("loading land") + land = [load_variable(sv, year, month_str, "land") for sv in source_variables] + print("collapsing") + ds_land = collapse_fun(*land) # type: ignore[operator] + ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) + + print("combining") + combined = ds_land.combine_first(ds_land_res) + datasets.append(combined) + + ds_year = xr.concat(datasets, dim="date").sortby("date") + + cd_data = ClimateDownscaleData(output_dir) + cd_data.save_era5_daily( + ds_year, target_variable, year, add_offset=e_offset, scale_factor=e_scale ) - return xr.where(temperature_c > 0, over_water, over_ice) - - -def rh_percent(temperature_c, dewpoint_temperature_c): - # saturated vapour pressure - es = buck_vapor_pressure(temperature_c) - # vapour pressure - e = buck_vapor_pressure(dewpoint_temperature_c) - rh = (e / es) * 100 - return rh - - -def heat_index(temperature_c, dewpoint_temperature_c): - t = temperature_c # Alias for simplicity in the formula - r = rh_percent(temperature_c, dewpoint_temperature_c) - - hi_raw = ( - -8.784695 - + 1.61139411 * t - + 
2.338549 * r - - 0.14611605 * t * r - - 1.2308094e-2 * t**2 - - 1.6424828e-2 * r**2 - + 2.211732e-3 * t**2 * r - + 7.2546e-4 * t * r**2 - - 3.582e-6 * t**2 * r**2 - ) - hi = xr.where(t > 20, hi_raw, t) - return hi - - -def humidex(temperature_c, dewpoint_temperature_c): - vp = buck_vapor_pressure(dewpoint_temperature_c) - return temperature_c + 0.5555 * (vp - 10) -def effective_temperature(temperature_c, dewpoint_temperature_c, uas, vas): - """https://www.sciencedirect.com/topics/engineering/effective-temperature""" - t = temperature_c - r = rh_percent(temperature_c, dewpoint_temperature_c) - v = vector_magnitude(uas, vas) - - wind_adjustment = 1 / (1.76 + 1.4 * v**0.75) - et = ( - 37 - - ((37 - t) / (0.68 - 0.0014 * r + wind_adjustment)) - - 0.29 * t * (1 - 0.01 * r) +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_year() +@with_variable() +def generate_era5_daily_task( + output_dir: str, + year: str, + target_variable: str, +) -> None: + generate_era5_daily_main(output_dir, year, target_variable) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_year(allow_all=True) +@with_variable(allow_all=True) +@clio.with_queue() +def generate_era5_daily( + output_dir: str, + year: str, + target_variable: str, + queue: str, +) -> None: + years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] + variables = ( + list(TRANSFORM_MAP.keys()) + if target_variable == clio.RUN_ALL + else [target_variable] ) - return et - -collapse_map = { - "mean_temperature": (["2m_temperature"], daily_mean, (273.15, 0.01)), - "max_temperature": (["2m_temperature"], daily_max, (273.15, 0.01)), - "min_temperature": (["2m_temperature"], daily_min, (273.15, 0.01)), - "cooling_degree_days": (["2m_temperature"], cdd, (0, 0.01)), - "heating_degree_days": (["2m_temperature"], hdd, (0, 0.01)), - "wind_speed": ( - ["10m_u_component_of_wind", "10m_v_component_of_wind"], - lambda x, y: daily_mean(vector_magnitude(x, y)), - (0, 0.01), - ), - "relative_humidity": ( - ["2m_temperature", "2m_dewpoint_temperature"], - lambda x, y: daily_mean(rh_percent(x, y)), - (0, 0.01), - ), - "total_precipitation": (["total_precipitation"], daily_sum, (0, 0.1)), - # "heat_index": ( - # ["2m_temperature", "2m_dewpoint_temperature"], lambda x, y: daily_mean(heat_index(x, y)), (273.15, 0.01) - # ), - # "humidex": ( - # ['2m_temperature', '2m_dewpoint_temperature'], lambda x, y: daily_mean(humidex(x, y)), (273.15, 0.01) - # ), - # "normal_effective_temperature": ( - # ["2m_temperature", "2m_dewpoint_temperature", "10m_u_component_of_wind", "10m_v_component_of_wind"], - # lambda *args: daily_mean(effective_temperature(*args)), (273.15, 0.01) - # ), -} - -year = "1990" -month = "01" -target_variable = "wind_speed" - -source_variables, collapse_fun, (e_offset, e_scale) = collapse_map[target_variable] - -print("loading single-levels") -single_level = [ - load_variable(sv, year, month, "single-levels") for sv in source_variables -] -print("collapsing") -ds = collapse_fun(*single_level) -ds = ds.assign(date=pd.to_datetime(ds.date)) - -print("interpolating") -ds_land_res = interpolate_to_target(ds) - -print("loading land") -land = [load_variable(sv, year, month, "land") for sv in source_variables] -print("collapsing") -ds_land = collapse_fun(*land) -ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) - -print("combining") -combined = ds_land.combine_first(ds_land_res) - -combined.to_netcdf( - "compressed.nc", - encoding={ - "value": { - "dtype": 
"int16", - "add_offset": e_offset, - "scale_factor": e_scale, - "_FillValue": -9999, - "zlib": True, - "complevel": 1, - } - }, -) + jobmon.run_parallel( + runner="cdtask", + task_name="extract cmip6", + node_args={ + "year": years, + "variable": variables, + }, + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "10G", + "runtime": "120m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + ) diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index e69de29..44de6d3 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -0,0 +1,277 @@ +import numpy as np +import xarray as xr + +############################# +# Standard unit conversions # +############################# + + +def kelvin_to_celsius(temperature_k: xr.Dataset) -> xr.Dataset: + """Convert temperature from Kelvin to Celsius + + Parameters + ---------- + temperature_k + Temperature in Kelvin + + Returns + ------- + xr.Dataset + Temperature in Celsius + """ + return temperature_k - 273.15 + + +def meter_to_millimeter(rainfall_m: xr.Dataset) -> xr.Dataset: + """Convert rainfall from meters to millimeters + + Parameters + ---------- + rainfall_m + Rainfall in meters + + Returns + ------- + xr.Dataset + Rainfall in millimeters + """ + return 1000 * rainfall_m + + +def scale_wind_speed_height(wind_speed_10m: xr.Dataset) -> xr.Dataset: + """Scaling wind speed from a height of 10 meters to a height of 2 meters + + Reference: Bröde et al. (2012) + https://doi.org/10.1007/s00484-011-0454-1 + + Parameters + ---------- + wind_speed_10m + The 10m wind speed [m/s]. May be signed (ie a velocity component) + + Returns + ------- + xr.DataSet + The 2m wind speed [m/s]. May be signed (ie a velocity component) + """ + scale_factor = np.log10(2 / 0.01) / np.log10(10 / 0.01) + return scale_factor * wind_speed_10m # type: ignore[no-any-return] + + +def identity(ds: xr.Dataset) -> xr.Dataset: + """Identity transformation""" + return ds + + +###################### +# Standard summaries # +###################### + + +def daily_mean(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("time.date").mean() + + +def daily_max(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("time.date").max() + + +def daily_min(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("time.date").min() + + +def daily_sum(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("time.date").sum() + + +######################## +# Data transformations # +######################## + + +def cdd(temperature_c: xr.Dataset) -> xr.Dataset: + """Calculate cooling degree days""" + return daily_mean(np.maximum(temperature_c - 18, 0)) # type: ignore[call-overload] + + +def hdd(temperature_c: xr.Dataset) -> xr.Dataset: + """Calculate heating degree days""" + return daily_mean(np.maximum(18 - temperature_c, 0)) # type: ignore[call-overload] + + +def vector_magnitude(x: xr.Dataset, y: xr.Dataset) -> xr.Dataset: + """Calculate the magnitude of a vector.""" + return np.sqrt(x**2 + y**2) # type: ignore[no-any-return] + + +def buck_vapor_pressure(temperature_c: xr.Dataset) -> xr.Dataset: + """Approximate vapor pressure of water. 
+ + https://en.wikipedia.org/wiki/Arden_Buck_equation + https://journals.ametsoc.org/view/journals/apme/20/12/1520-0450_1981_020_1527_nefcvp_2_0_co_2.xml + + Parameters + ---------- + temperature_c + Temperature in Celsius + + Returns + ------- + xr.Dataset + Vapor pressure in hPa + """ + over_water = 6.1121 * np.exp( + (18.678 - temperature_c / 234.5) * (temperature_c / (257.14 + temperature_c)) + ) + over_ice = 6.1115 * np.exp( + (23.036 - temperature_c / 333.7) * (temperature_c / (279.82 + temperature_c)) + ) + vp = xr.where(temperature_c > 0, over_water, over_ice) # type: ignore[no-untyped-call] + return vp # type: ignore[no-any-return] + + +def rh_percent( + temperature_c: xr.Dataset, dewpoint_temperature_c: xr.Dataset +) -> xr.Dataset: + """Calculate relative humidity from temperature and dewpoint temperature. + + Parameters + ---------- + temperature_c + Temperature in Celsius + dewpoint_temperature_c + Dewpoint temperature in Celsius + + Returns + ------- + xr.Dataset + Relative humidity as a percentage + """ + # saturation vapour pressure + svp = buck_vapor_pressure(temperature_c) + # actual vapour pressure + vp = buck_vapor_pressure(dewpoint_temperature_c) + return 100 * vp / svp + + +def heat_index( + temperature_c: xr.Dataset, dewpoint_temperature_c: xr.Dataset +) -> xr.Dataset: + """Calculate the heat index. + + https://www.weather.gov/media/ffc/ta_htindx.PDF + + Parameters + ---------- + temperature_c + Temperature in Celsius + dewpoint_temperature_c + Dewpoint temperature in Celsius + + Returns + ------- + xr.Dataset + Heat index in Celsius + """ + t = temperature_c # Alias for simplicity in the formula + r = rh_percent(temperature_c, dewpoint_temperature_c) + + # Heat index formula from canonical multi-variable regression + hi_raw = ( + -8.784695 + + 1.61139411 * t + + 2.338549 * r + - 0.14611605 * t * r + - 1.2308094e-2 * t**2 + - 1.6424828e-2 * r**2 + + 2.211732e-3 * t**2 * r + + 7.2546e-4 * t * r**2 + - 3.582e-6 * t**2 * r**2 + ) + # Below 20 degrees, the heat index is the same as the temperature + hi_threshold = 20 + hi = xr.where(t > hi_threshold, hi_raw, t) # type: ignore[no-untyped-call] + return hi # type: ignore[no-any-return] + + +def humidex( + temperature_c: xr.Dataset, dewpoint_temperature_c: xr.Dataset +) -> xr.Dataset: + """Calculate the humidex. + + https://en.wikipedia.org/wiki/Humidex + + Parameters + ---------- + temperature_c + Temperature in Celsius + dewpoint_temperature_c + Dewpoint temperature in Celsius + + Returns + ------- + xr.Dataset + Humidex in Celsius + """ + vp = buck_vapor_pressure(dewpoint_temperature_c) + return temperature_c + 0.5555 * (vp - 10) + + +def effective_temperature( + temperature_c: xr.Dataset, + dewpoint_temperature_c: xr.Dataset, + uas: xr.Dataset, + vas: xr.Dataset, +) -> xr.Dataset: + """Calculate the effective temperature. 
+ + https://www.sciencedirect.com/topics/engineering/effective-temperature + + Parameters + ---------- + temperature_c + Temperature in Celsius + dewpoint_temperature_c + Dewpoint temperature in Celsius + uas + U-component of wind speed + vas + V-component of wind speed + + Returns + ------- + xr.Dataset + Effective temperature in Celsius + """ + # Alias for simplicity in the formula + t = temperature_c + r = rh_percent(temperature_c, dewpoint_temperature_c) + v = vector_magnitude(uas, vas) + + wind_adjustment = 1 / (1.76 + 1.4 * v**0.75) + et = ( + 37 + - ((37 - t) / (0.68 - 0.0014 * r + wind_adjustment)) + - 0.29 * t * (1 - 0.01 * r) + ) + return et + + +################ +# Data cleanup # +################ + + +def rename_val_column(ds: xr.Dataset) -> xr.Dataset: + data_var = next(iter(ds)) + return ds.rename({data_var: "value"}) + + +def interpolate_to_target_latlon( + ds: xr.Dataset, + target_lat: xr.DataArray, + target_lon: xr.DataArray, +) -> xr.Dataset: + return ds.interp( + longitude=target_lon, latitude=target_lat, method="nearest" + ).interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") From 6760a191384bce0c252c905858ba13e8603e3df0 Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 13 Jun 2024 21:23:04 -0700 Subject: [PATCH 35/71] Fix runner --- src/climate_downscale/generate/era5_daily.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/era5_daily.py index 6a7a45d..0d1f645 100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -231,7 +231,7 @@ def generate_era5_daily( jobmon.run_parallel( runner="cdtask", - task_name="extract cmip6", + task_name="generate era5_daily", node_args={ "year": years, "variable": variables, From 5272118a1ccf2b4592b5acf9e072f33b927d0035 Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 13 Jun 2024 21:26:15 -0700 Subject: [PATCH 36/71] Add month specific logging and shorten range for testing --- src/climate_downscale/generate/era5_daily.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/era5_daily.py index 0d1f645..c85770b 100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -166,9 +166,9 @@ def generate_era5_daily_main( source_variables, collapse_fun, (e_offset, e_scale) = TRANSFORM_MAP[target_variable] datasets = [] - for month in range(1, 13): + for month in range(1, 3): month_str = f"{month:02d}" - print("loading single-levels") + print(f"loading single-levels for {month_str}") single_level = [ load_variable(sv, year, month_str, "single-levels") for sv in source_variables @@ -181,7 +181,7 @@ def generate_era5_daily_main( print("interpolating") ds_land_res = utils.interpolate_to_target_latlon(ds, TARGET_LAT, TARGET_LON) - print("loading land") + print(f"loading land for {month_str}") land = [load_variable(sv, year, month_str, "land") for sv in source_variables] print("collapsing") ds_land = collapse_fun(*land) # type: ignore[operator] From c75e03b32a86a9a8268645e6158e52df1671c82f Mon Sep 17 00:00:00 2001 From: collijk Date: Thu, 13 Jun 2024 23:03:21 -0700 Subject: [PATCH 37/71] Be lazier --- src/climate_downscale/generate/era5_daily.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/era5_daily.py index c85770b..54990cf 
100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -124,7 +124,7 @@ def with_variable( def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: - ds = xr.load_dataset(ds_path) + ds = xr.open_dataset(ds_path) ds = ds.assign_coords(longitude=(ds.longitude + 180) % 360 - 180).sortby( "longitude" ) @@ -140,6 +140,7 @@ def load_variable( root = Path("/mnt/share/erf/climate_downscale/extracted_data/era5") p = root / f"reanalysis-era5-{dataset}_{variable}_{year}_{month}.nc" if dataset == "land" and not p.exists(): + raise NotImplementedError # Substitute the single level dataset pre-interpolated at the target resolution. p = root / f"reanalysis-era5-single-levels_{variable}_{year}_{month}.nc" ds = utils.interpolate_to_target_latlon( @@ -174,7 +175,7 @@ def generate_era5_daily_main( for sv in source_variables ] print("collapsing") - ds = collapse_fun(*single_level) # type: ignore[operator] + ds = collapse_fun(*single_level).compute() # type: ignore[operator] # collapsing often screws the date dtype, so fix it ds = ds.assign(date=pd.to_datetime(ds.date)) @@ -184,7 +185,7 @@ def generate_era5_daily_main( print(f"loading land for {month_str}") land = [load_variable(sv, year, month_str, "land") for sv in source_variables] print("collapsing") - ds_land = collapse_fun(*land) # type: ignore[operator] + ds_land = collapse_fun(*land).compute() # type: ignore[operator] ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) print("combining") From 0b52a39154aa1df98894b40f1f8340652bb844d0 Mon Sep 17 00:00:00 2001 From: James Collins Date: Fri, 14 Jun 2024 00:25:53 -0700 Subject: [PATCH 38/71] Add cmip daily --- src/climate_downscale/generate/cmip_daily.py | 43 ++++++++++++++++++++ src/climate_downscale/generate/era5_daily.py | 30 +++++++------- 2 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 src/climate_downscale/generate/cmip_daily.py diff --git a/src/climate_downscale/generate/cmip_daily.py b/src/climate_downscale/generate/cmip_daily.py new file mode 100644 index 0000000..86085e5 --- /dev/null +++ b/src/climate_downscale/generate/cmip_daily.py @@ -0,0 +1,43 @@ +import pandas as pd +import xarray as xr +from pathlib import Path +import numpy as np +import tqdm + +from climate_downscale.generate import utils + +TARGET_LON = xr.DataArray( + np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" +) +TARGET_LAT = xr.DataArray( + np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" +) + +variable = 'tas' +scenario = 'ssp119' +year = '2024' + +paths = sorted(list(Path("/mnt/share/erf/climate_downscale/extracted_data/cmip6").glob("tas_ssp119*.nc"))) +p = paths[0] + +def compute_anomaly(path, year): + reference_period = slice("2015-01-01", "2024-12-31") + ref = xr.open_dataset(p).sel(time=reference_period).compute().groupby("time.month").mean("time") + + time_slice = slice(f"{year}-01", f"{year}-12") + time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") + target = xr.open_dataset(p).sel(time=time_slice).compute() + target = target.assign_coords(time=pd.to_datetime(target.time.dt.date)).interp_calendar(time_range) + + anomaly = target.groupby('time.month') - ref + anomaly = anomaly.rename({'lat': 'latitude', 'lon': 'longitude'}) + anomaly = anomaly.assign_coords(longitude=(anomaly.longitude + 180) % 360 - 180).sortby("longitude") + anomaly = utils.interpolate_to_target_latlon(anomaly, target_lat=TARGET_LAT, target_lon=TARGET_LON) + + return anomaly + +a = 1 / 
len(paths) * compute_anomaly(paths[0], year) + +for p in tqdm.tqdm(paths[1:]): + a += 1 / len(paths) * compute_anomaly(p, year) + diff --git a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/era5_daily.py index 54990cf..71a6601 100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -2,6 +2,7 @@ from pathlib import Path import click +import dask import numpy as np import pandas as pd import xarray as xr @@ -78,9 +79,6 @@ utils.daily_sum, (0, 0.1), ), -} - -UNTESTED_TRANSFORM_MAP = { "heat_index": ( ["2m_temperature", "2m_dewpoint_temperature"], lambda x, y: utils.daily_mean(utils.heat_index(x, y)), @@ -124,10 +122,11 @@ def with_variable( def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: - ds = xr.open_dataset(ds_path) - ds = ds.assign_coords(longitude=(ds.longitude + 180) % 360 - 180).sortby( - "longitude" - ) + ds = xr.open_dataset(ds_path).chunk(time=24) + with dask.config.set(**{'array.slicing.split_large_chunks': False}): + ds = ds.assign_coords(longitude=(ds.longitude + 180) % 360 - 180).sortby( + "longitude" + ) return ds @@ -167,9 +166,9 @@ def generate_era5_daily_main( source_variables, collapse_fun, (e_offset, e_scale) = TRANSFORM_MAP[target_variable] datasets = [] - for month in range(1, 3): + for month in range(1, 13): month_str = f"{month:02d}" - print(f"loading single-levels for {month_str}") + print(f"loading single-levels for {month_str}") single_level = [ load_variable(sv, year, month_str, "single-levels") for sv in source_variables @@ -180,16 +179,17 @@ def generate_era5_daily_main( ds = ds.assign(date=pd.to_datetime(ds.date)) print("interpolating") - ds_land_res = utils.interpolate_to_target_latlon(ds, TARGET_LAT, TARGET_LON) + ds_land_res = utils.interpolate_to_target_latlon(ds, TARGET_LAT, TARGET_LON) print(f"loading land for {month_str}") land = [load_variable(sv, year, month_str, "land") for sv in source_variables] print("collapsing") - ds_land = collapse_fun(*land).compute() # type: ignore[operator] + with dask.config.set(**{'array.slicing.split_large_chunks': False}): + ds_land = collapse_fun(*land).compute() # type: ignore[operator] ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) print("combining") - combined = ds_land.combine_first(ds_land_res) + combined = ds_land.combine_first(ds_land_res) datasets.append(combined) ds_year = xr.concat(datasets, dim="date").sortby("date") @@ -235,15 +235,15 @@ def generate_era5_daily( task_name="generate era5_daily", node_args={ "year": years, - "variable": variables, + "target-variable": variables, }, task_args={ "output-dir": output_dir, }, task_resources={ "queue": queue, - "cores": 1, - "memory": "10G", + "cores": 5, + "memory": "100G", "runtime": "120m", "project": "proj_rapidresponse", }, From 51fd1bd0534069092401603ffa8b875cb065bf3e Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 14 Jun 2024 21:38:02 -0700 Subject: [PATCH 39/71] Change layout for era5 daily --- poetry.lock | 116 +++++++++++++++++- pyproject.toml | 1 + src/climate_downscale/data.py | 73 ++++++----- src/climate_downscale/extract/era5.py | 12 +- src/climate_downscale/generate/cmip6_daily.py | 103 ++++++++++++++++ src/climate_downscale/generate/cmip_daily.py | 43 ------- src/climate_downscale/generate/era5_daily.py | 57 ++++----- src/climate_downscale/generate/utils.py | 46 ++++--- 8 files changed, 323 insertions(+), 128 deletions(-) create mode 100644 src/climate_downscale/generate/cmip6_daily.py delete mode 100644 
src/climate_downscale/generate/cmip_daily.py diff --git a/poetry.lock b/poetry.lock index daae923..6c2ae67 100644 --- a/poetry.lock +++ b/poetry.lock @@ -449,6 +449,17 @@ click = ">=4.0" [package.extras] test = ["pytest-cov"] +[[package]] +name = "cloudpickle" +version = "3.0.0" +description = "Pickler class to extend the standard pickle.Pickler functionality" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cloudpickle-3.0.0-py3-none-any.whl", hash = "sha256:246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7"}, + {file = "cloudpickle-3.0.0.tar.gz", hash = "sha256:996d9a482c6fb4f33c1a35335cf8afd065d2a56e973270364840712d9131a882"}, +] + [[package]] name = "colorama" version = "0.4.6" @@ -605,6 +616,35 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "dask" +version = "2024.5.2" +description = "Parallel PyData with Task Scheduling" +optional = false +python-versions = ">=3.9" +files = [ + {file = "dask-2024.5.2-py3-none-any.whl", hash = "sha256:acc2cfe41d9e0151c216ac40396dbe34df13bc3d8c51dfece190349e4f2243af"}, + {file = "dask-2024.5.2.tar.gz", hash = "sha256:5c9722c44d0195e78b6e54197aa3302e6fcaaac2310fd3014560bcb86253dcb3"}, +] + +[package.dependencies] +click = ">=8.1" +cloudpickle = ">=1.5.0" +fsspec = ">=2021.09.0" +importlib-metadata = {version = ">=4.13.0", markers = "python_version < \"3.12\""} +packaging = ">=20.0" +partd = ">=1.2.0" +pyyaml = ">=5.3.1" +toolz = ">=0.10.0" + +[package.extras] +array = ["numpy (>=1.21)"] +complete = ["dask[array,dataframe,diagnostics,distributed]", "lz4 (>=4.3.2)", "pyarrow (>=7.0)", "pyarrow-hotfix"] +dataframe = ["dask-expr (>=1.1,<1.2)", "dask[array]", "pandas (>=1.3)"] +diagnostics = ["bokeh (>=2.4.2)", "jinja2 (>=2.10.3)"] +distributed = ["distributed (==2024.5.2)"] +test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist"] + [[package]] name = "decorator" version = "5.1.1" @@ -1290,6 +1330,25 @@ files = [ {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] +[[package]] +name = "importlib-metadata" +version = "7.1.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"}, + {file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"}, +] + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +perf = ["ipython"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -1442,6 +1501,17 @@ files = [ {file = "kiwisolver-1.4.5.tar.gz", hash = "sha256:e57e563a57fb22a142da34f38acc2fc1a5c864bc29ca1517a88abc963e60d6ec"}, ] +[[package]] +name = "locket" +version = "1.0.0" +description = "File-based locks for Python on Linux and Windows" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "locket-1.0.0-py2.py3-none-any.whl", hash = 
"sha256:b6c819a722f7b6bd955b80781788e4a66a55628b858d347536b7e81325a3a5e3"}, + {file = "locket-1.0.0.tar.gz", hash = "sha256:5c0d4c052a8bbbf750e056a8e65ccd309086f4f0f18a2eac306a8dfa4112a632"}, +] + [[package]] name = "loguru" version = "0.7.2" @@ -2214,6 +2284,24 @@ sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-d test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] xml = ["lxml (>=4.9.2)"] +[[package]] +name = "partd" +version = "1.4.2" +description = "Appendable key-value storage" +optional = false +python-versions = ">=3.9" +files = [ + {file = "partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f"}, + {file = "partd-1.4.2.tar.gz", hash = "sha256:d022c33afbdc8405c226621b015e8067888173d85f7f5ecebb3cafed9a20f02c"}, +] + +[package.dependencies] +locket = "*" +toolz = "*" + +[package.extras] +complete = ["blosc", "numpy (>=1.20.0)", "pandas (>=1.3)", "pyzmq"] + [[package]] name = "pathos" version = "0.3.2" @@ -3278,6 +3366,17 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "toolz" +version = "0.12.1" +description = "List processing tools and functional utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "toolz-0.12.1-py3-none-any.whl", hash = "sha256:d22731364c07d72eea0a0ad45bafb2c2937ab6fd38a3507bf55eae8744aa7d85"}, + {file = "toolz-0.12.1.tar.gz", hash = "sha256:ecca342664893f177a13dac0e6b41cbd8ac25a358e5f215316d43e2100224f4d"}, +] + [[package]] name = "tqdm" version = "4.66.4" @@ -3596,7 +3695,22 @@ numpy = ">=1.23" docs = ["numcodecs[msgpack]", "numpydoc", "pydata-sphinx-theme", "sphinx", "sphinx-automodapi", "sphinx-copybutton", "sphinx-design", "sphinx-issues"] jupyter = ["ipytree (>=0.2.2)", "ipywidgets (>=8.0.0)", "notebook"] +[[package]] +name = "zipp" +version = "3.19.2" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.19.2-py3-none-any.whl", hash = "sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c"}, + {file = "zipp-3.19.2.tar.gz", hash = "sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19"}, +] + +[package.extras] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] + [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.13" -content-hash = "d956b3098dcb83693feb9ac5cb4b39749dbd7ef6e90a8e2bd878ee7c3dc13f43" +content-hash = "e0e7f81ba64d5f9ceee07a0a5635b84eb8f65a541f15dcc047f1f42e5ab21053" diff --git a/pyproject.toml b/pyproject.toml index b19f494..cc5fee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ types-tqdm = "^4.66.0.20240417" gcsfs = "^2024.6.0" zarr = "^2.18.2" types-pyyaml = "^6.0.12.20240311" +dask = "^2024.5.2" [tool.poetry.group.dev.dependencies] mkdocstrings = {version = ">=0.23", extras = ["python"]} diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 839448f..ef77ebf 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -4,7 +4,7 @@ import pandas as pd import 
rasterra as rt import xarray as xr -from rra_tools.shell_tools import touch +from rra_tools.shell_tools import mkdir, touch DEFAULT_ROOT = "/mnt/share/erf/climate_downscale/" @@ -22,30 +22,29 @@ def root(self) -> Path: def credentials_root(self) -> Path: return self._credentials_root + ################## + # Extracted data # + ################## + @property def extracted_data(self) -> Path: return self.root / "extracted_data" @property - def era5(self) -> Path: + def extracted_era5(self) -> Path: return self.extracted_data / "era5" - def era5_path( + def extracted_era5_path( self, dataset: str, variable: str, year: int | str, month: str ) -> Path: - return self.era5 / f"{dataset}_{variable}_{year}_{month}.nc" - - def load_era5( - self, dataset: str, variable: str, year: int | str, month: str - ) -> xr.Dataset: - return xr.open_dataset(self.era5_path(dataset, variable, year, month)) + return self.extracted_era5 / f"{dataset}_{variable}_{year}_{month}.nc" @property - def cmip6(self) -> Path: + def extracted_cmip6(self) -> Path: return self.extracted_data / "cmip6" def load_cmip6_metadata(self) -> pd.DataFrame: - meta_path = self.cmip6 / "cmip6-metadata.parquet" + meta_path = self.extracted_cmip6 / "cmip6-metadata.parquet" if not meta_path.exists(): external_path = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv" meta = pd.read_csv(external_path) @@ -53,15 +52,6 @@ def load_cmip6_metadata(self) -> pd.DataFrame: meta.to_parquet(meta_path) return pd.read_parquet(meta_path) - @property - def era5_temperature_daily_mean(self) -> Path: - return self.extracted_data / "era5_temperature_daily_mean" - - def load_era5_temperature_daily_mean(self, year: int | str) -> xr.Dataset: - return xr.load_dataset( - self.era5_temperature_daily_mean / f"{year}_era5_temp_daily.nc" - ) - @property def ncei_climate_stations(self) -> Path: return self.extracted_data / "ncei_climate_stations" @@ -74,10 +64,6 @@ def save_ncei_climate_stations(self, df: pd.DataFrame, year: int | str) -> None: def load_ncei_climate_stations(self, year: int | str) -> pd.DataFrame: return pd.read_parquet(self.ncei_climate_stations / f"{year}.parquet") - @property - def srtm_elevation_gl1(self) -> Path: - return self.extracted_data / "srtm_elevation_gl1" - @property def open_topography_elevation(self) -> Path: return self.extracted_data / "open_topography_elevation" @@ -86,6 +72,10 @@ def open_topography_elevation(self) -> Path: def rub_local_climate_zones(self) -> Path: return self.extracted_data / "rub_local_climate_zones" + ################### + # Downscale model # + ################### + @property def downscale_model(self) -> Path: return self.root / "downscale_model" @@ -119,21 +109,33 @@ def save_training_data(self, df: pd.DataFrame, year: int | str) -> None: def load_training_data(self, year: int | str) -> pd.DataFrame: return pd.read_parquet(self.training_data / f"{year}.parquet") + ########### + # Results # + ########### + @property def results(self) -> Path: return self.root / "results" @property - def era5_daily(self) -> Path: - return self.results / "era5_daily" + def daily_results(self) -> Path: + return self.results / "daily" - def save_era5_daily( + def daily_results_path(self, scenario: str, variable: str, year: int | str) -> Path: + return self.daily_results / scenario / variable / f"{year}.nc" + + def save_daily_results( self, - ds: xr.Dataset, + results_ds: xr.Dataset, + scenario: str, variable: str, year: int | str, **encoding_kwargs: Any, - ) -> None: + ): + path = 
self.daily_results_path(scenario, variable, year) + mkdir(path.parent, exist_ok=True, parents=True) + touch(path, exist_ok=True) + encoding = { "dtype": "int16", "_FillValue": -32767, @@ -141,9 +143,16 @@ def save_era5_daily( "complevel": 1, } encoding.update(encoding_kwargs) - path = self.era5_daily / f"{variable}_{year}.nc" - touch(path, exist_ok=True) - ds.to_netcdf(path, encoding={"value": encoding}) + results_ds.to_netcdf(path, encoding={"value": encoding}) + + def load_daily_results( + self, + scenario: str, + variable: str, + year: int | str, + ) -> xr.Dataset: + results_path = self.daily_results_path(scenario, variable, year) + return xr.open_dataset(results_path) def save_raster( diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 058c7d8..95f49f2 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -35,7 +35,9 @@ def download_era5_main( ) -> None: cddata = ClimateDownscaleData(output_dir) - final_out_path = cddata.era5_path(era5_dataset, era5_variable, year, month) + final_out_path = cddata.extracted_era5_path( + era5_dataset, era5_variable, year, month + ) download_path, download_format = get_download_spec(final_out_path) if download_path.exists(): @@ -84,7 +86,9 @@ def unzip_and_compress_era5( month: str, ) -> None: cddata = ClimateDownscaleData(output_dir) - final_out_path = cddata.era5_path(era5_dataset, era5_variable, year, month) + final_out_path = cddata.extracted_era5_path( + era5_dataset, era5_variable, year, month + ) zip_path = final_out_path.with_suffix(".zip") uncompressed_path = final_out_path.with_stem(f"{final_out_path.stem}_raw") @@ -214,7 +218,7 @@ def extract_era5( # noqa: PLR0913 to_compress = [] complete = [] for spec in itertools.product(datasets, variables, years, months): - final_out_path = cddata.era5_path(*spec) + final_out_path = cddata.extracted_era5_path(*spec) download_path, _ = get_download_spec(final_out_path) if final_out_path.exists() and download_path.exists(): @@ -250,7 +254,7 @@ def extract_era5( # noqa: PLR0913 for user in users: if to_download: download_batch.append((*to_download.pop(), user)) - if not len(download_batch) == min(len(users) * jobs_per_user, downloads_left): + if len(download_batch) != min(len(users) * jobs_per_user, downloads_left): msg = "Download batch size is incorrect" raise ValueError(msg) diff --git a/src/climate_downscale/generate/cmip6_daily.py b/src/climate_downscale/generate/cmip6_daily.py new file mode 100644 index 0000000..aa148a2 --- /dev/null +++ b/src/climate_downscale/generate/cmip6_daily.py @@ -0,0 +1,103 @@ +from pathlib import Path + +import pandas as pd +import tqdm +import xarray as xr + +from climate_downscale.data import ClimateDownscaleData +from climate_downscale.generate import utils + + +def compute_anomaly( + reference: xr.DataArray, target: xr.DataArray, anomaly_type: str +) -> xr.Dataset: + if anomaly_type == "additive": + anomaly = target.groupby("time.month") - reference + elif anomaly_type == "multiplicative": + anomaly = (target.groupby("time.month") + 1) / (reference + 1) + else: + msg = f"Unknown anomaly type: {anomaly_type}" + raise ValueError(msg) + + anomaly = ( + anomaly.drop_vars("month") + .rename({"lat": "latitude", "lon": "longitude", "time": "date"}) + .assign_coords(longitude=(anomaly.longitude + 180) % 360 - 180) + .sortby("longitude") + ) + anomaly = utils.interpolate_to_target_latlon(anomaly) + return anomaly + + +TRANSFORM_MAP = { + "tas": (utils.kelvin_to_celsius, "additive"), + "pr": 
(utils.precipitation_flux_to_rainfall, "multiplicative"), +} + + +def load_reference_and_target( + path: str | Path, year: str | int +) -> tuple[xr.Dataset, xr.Dataset]: + reference = ( + xr.open_dataset(path) + .sel(time=utils.REFERENCE_PERIOD) + .compute() # Load the subset before computing the mean, otherwise it's slow + .groupby("time.month") + .mean("time") + ) + + time_slice = slice(f"{year}-01", f"{year}-12") + time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") + target = xr.open_dataset(path).sel(time=time_slice).compute() + target = ( + target.assign_coords(time=target.time.dt.floor("D")) + .interp_calendar(time_range) + .interpolate_na(dim="time", method="nearest", fill_value="extrapolate") + ) + return reference, target + + +def generate_cmip6_daily_main( + output_dir: str | Path, + year: str | int, + target_variable: str, + cmip_scenario: str, + rerefk, +) -> None: + cd_data = ClimateDownscaleData(output_dir) + paths = cd_data.cmip6.glob(f"{target_variable}_{cmip_scenario}*.nc") + + +def compute_anomaly(path, year): + reference_period = slice("2015-01-01", "2024-12-31") + + anomaly = target.groupby("time.month") - ref + anomaly = anomaly.rename({"lat": "latitude", "lon": "longitude"}) + anomaly = anomaly.assign_coords( + longitude=(anomaly.longitude + 180) % 360 - 180 + ).sortby("longitude") + anomaly = utils.interpolate_to_target_latlon( + anomaly, target_lat=TARGET_LAT, target_lon=TARGET_LON + ) + + return anomaly + + +variable = "tas" +scenario = "ssp119" +year = "2024" + +paths = sorted( + list( + Path("/mnt/share/erf/climate_downscale/extracted_data/cmip6").glob( + "tas_ssp119*.nc" + ) + ) +) +p = paths[0] + + +a = 1 / len(paths) * compute_anomaly(paths[0], year) + +for p in tqdm.tqdm(paths[1:]): + a += 1 / len(paths) * compute_anomaly(p, year) diff --git a/src/climate_downscale/generate/cmip_daily.py b/src/climate_downscale/generate/cmip_daily.py deleted file mode 100644 index 86085e5..0000000 --- a/src/climate_downscale/generate/cmip_daily.py +++ /dev/null @@ -1,43 +0,0 @@ -import pandas as pd -import xarray as xr -from pathlib import Path -import numpy as np -import tqdm - -from climate_downscale.generate import utils - -TARGET_LON = xr.DataArray( - np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" -) -TARGET_LAT = xr.DataArray( - np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" -) - -variable = 'tas' -scenario = 'ssp119' -year = '2024' - -paths = sorted(list(Path("/mnt/share/erf/climate_downscale/extracted_data/cmip6").glob("tas_ssp119*.nc"))) -p = paths[0] - -def compute_anomaly(path, year): - reference_period = slice("2015-01-01", "2024-12-31") - ref = xr.open_dataset(p).sel(time=reference_period).compute().groupby("time.month").mean("time") - - time_slice = slice(f"{year}-01", f"{year}-12") - time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") - target = xr.open_dataset(p).sel(time=time_slice).compute() - target = target.assign_coords(time=pd.to_datetime(target.time.dt.date)).interp_calendar(time_range) - - anomaly = target.groupby('time.month') - ref - anomaly = anomaly.rename({'lat': 'latitude', 'lon': 'longitude'}) - anomaly = anomaly.assign_coords(longitude=(anomaly.longitude + 180) % 360 - 180).sortby("longitude") - anomaly = utils.interpolate_to_target_latlon(anomaly, target_lat=TARGET_LAT, target_lon=TARGET_LON) - - return anomaly - -a = 1 / len(paths) * compute_anomaly(paths[0], year) - -for p in tqdm.tqdm(paths[1:]): - a += 1 / len(paths) * compute_anomaly(p, year) - diff --git 
a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/era5_daily.py index 71a6601..34b2a24 100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/era5_daily.py @@ -3,7 +3,6 @@ import click import dask -import numpy as np import pandas as pd import xarray as xr from rra_tools import jobmon @@ -12,13 +11,6 @@ from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData from climate_downscale.generate import utils -TARGET_LON = xr.DataArray( - np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" -) -TARGET_LAT = xr.DataArray( - np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" -) - # Map from source variable to a unit conversion function CONVERT_MAP = { "10m_u_component_of_wind": utils.scale_wind_speed_height, @@ -54,15 +46,10 @@ utils.daily_min, (273.15, 0.01), ), - "cooling_degree_days": ( - ["2m_temperature"], - utils.cdd, - (0, 0.01), - ), - "heating_degree_days": ( - ["2m_temperature"], - utils.hdd, - (0, 0.01), + "dewpoint_temperature": ( + ["2m_dewpoint_temperature"], + utils.daily_mean, + (273.15, 0.01), ), "wind_speed": ( ["10m_u_component_of_wind", "10m_v_component_of_wind"], @@ -79,6 +66,9 @@ utils.daily_sum, (0, 0.1), ), +} + +ADDITIONAL_TRANSFORM_MAP = { "heat_index": ( ["2m_temperature", "2m_dewpoint_temperature"], lambda x, y: utils.daily_mean(utils.heat_index(x, y)), @@ -122,8 +112,8 @@ def with_variable( def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: - ds = xr.open_dataset(ds_path).chunk(time=24) - with dask.config.set(**{'array.slicing.split_large_chunks': False}): + ds = xr.open_dataset(ds_path).chunk(time=24) + with dask.config.set(**{"array.slicing.split_large_chunks": False}): ds = ds.assign_coords(longitude=(ds.longitude + 180) % 360 - 180).sortby( "longitude" ) @@ -142,14 +132,10 @@ def load_variable( raise NotImplementedError # Substitute the single level dataset pre-interpolated at the target resolution. 
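+        # The single-levels data lives on a coarser 0.25 degree grid, so it
+        # must be interpolated up to the 0.1 degree target grid before use.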
p = root / f"reanalysis-era5-single-levels_{variable}_{year}_{month}.nc" - ds = utils.interpolate_to_target_latlon( - load_and_shift_longitude(p), - target_lat=TARGET_LAT, - target_lon=TARGET_LON, - ) + ds = utils.interpolate_to_target_latlon(load_and_shift_longitude(p)) elif dataset == "land": ds = load_and_shift_longitude(p).assign_coords( - latitude=TARGET_LAT, longitude=TARGET_LON + latitude=utils.TARGET_LAT, longitude=utils.TARGET_LON ) else: ds = load_and_shift_longitude(p) @@ -168,7 +154,7 @@ def generate_era5_daily_main( datasets = [] for month in range(1, 13): month_str = f"{month:02d}" - print(f"loading single-levels for {month_str}") + print(f"loading single-levels for {month_str}") single_level = [ load_variable(sv, year, month_str, "single-levels") for sv in source_variables @@ -179,24 +165,29 @@ def generate_era5_daily_main( ds = ds.assign(date=pd.to_datetime(ds.date)) print("interpolating") - ds_land_res = utils.interpolate_to_target_latlon(ds, TARGET_LAT, TARGET_LON) + ds_land_res = utils.interpolate_to_target_latlon(ds) print(f"loading land for {month_str}") land = [load_variable(sv, year, month_str, "land") for sv in source_variables] print("collapsing") - with dask.config.set(**{'array.slicing.split_large_chunks': False}): + with dask.config.set(**{"array.slicing.split_large_chunks": False}): ds_land = collapse_fun(*land).compute() # type: ignore[operator] ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) print("combining") - combined = ds_land.combine_first(ds_land_res) + combined = ds_land.combine_first(ds_land_res) datasets.append(combined) ds_year = xr.concat(datasets, dim="date").sortby("date") cd_data = ClimateDownscaleData(output_dir) - cd_data.save_era5_daily( - ds_year, target_variable, year, add_offset=e_offset, scale_factor=e_scale + cd_data.save_daily_results( + ds_year, + scenario="historical", + variable=target_variable, + year=year, + add_offset=e_offset, + scale_factor=e_scale, ) @@ -232,7 +223,7 @@ def generate_era5_daily( jobmon.run_parallel( runner="cdtask", - task_name="generate era5_daily", + task_name="generate historical_daily", node_args={ "year": years, "target-variable": variables, @@ -243,7 +234,7 @@ def generate_era5_daily( task_resources={ "queue": queue, "cores": 5, - "memory": "100G", + "memory": "200G", "runtime": "120m", "project": "proj_rapidresponse", }, diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index 44de6d3..a0ed5b4 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -1,6 +1,14 @@ import numpy as np import xarray as xr +REFERENCE_PERIOD = slice("2015-01-01", "2024-12-31") +TARGET_LON = xr.DataArray( + np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" +) +TARGET_LAT = xr.DataArray( + np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" +) + ############################# # Standard unit conversions # ############################# @@ -38,6 +46,24 @@ def meter_to_millimeter(rainfall_m: xr.Dataset) -> xr.Dataset: return 1000 * rainfall_m +def precipitation_flux_to_rainfall(precipitation_flux: xr.Dataset) -> xr.Dataset: + """Convert precipitation flux to rainfall + + Parameters + ---------- + precipitation_flux + Precipitation flux in kg m-2 s-1 + + Returns + ------- + xr.Dataset + Rainfall in mm/day + """ + seconds_per_day = 86400 + mm_per_kg_m2 = 1 + return seconds_per_day * mm_per_kg_m2 * precipitation_flux # type: ignore[no-any-return]k + + def 
scale_wind_speed_height(wind_speed_10m: xr.Dataset) -> xr.Dataset: """Scaling wind speed from a height of 10 meters to a height of 2 meters @@ -89,16 +115,6 @@ def daily_sum(ds: xr.Dataset) -> xr.Dataset: ######################## -def cdd(temperature_c: xr.Dataset) -> xr.Dataset: - """Calculate cooling degree days""" - return daily_mean(np.maximum(temperature_c - 18, 0)) # type: ignore[call-overload] - - -def hdd(temperature_c: xr.Dataset) -> xr.Dataset: - """Calculate heating degree days""" - return daily_mean(np.maximum(18 - temperature_c, 0)) # type: ignore[call-overload] - - def vector_magnitude(x: xr.Dataset, y: xr.Dataset) -> xr.Dataset: """Calculate the magnitude of a vector.""" return np.sqrt(x**2 + y**2) # type: ignore[no-any-return] @@ -155,7 +171,8 @@ def rh_percent( def heat_index( - temperature_c: xr.Dataset, dewpoint_temperature_c: xr.Dataset + temperature_c: xr.Dataset, + dewpoint_temperature_c: xr.Dataset, ) -> xr.Dataset: """Calculate the heat index. @@ -195,7 +212,8 @@ def heat_index( def humidex( - temperature_c: xr.Dataset, dewpoint_temperature_c: xr.Dataset + temperature_c: xr.Dataset, + dewpoint_temperature_c: xr.Dataset, ) -> xr.Dataset: """Calculate the humidex. @@ -269,9 +287,7 @@ def rename_val_column(ds: xr.Dataset) -> xr.Dataset: def interpolate_to_target_latlon( ds: xr.Dataset, - target_lat: xr.DataArray, - target_lon: xr.DataArray, ) -> xr.Dataset: return ds.interp( - longitude=target_lon, latitude=target_lat, method="nearest" + longitude=TARGET_LON, latitude=TARGET_LAT, method="nearest" ).interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") From c61f2c6b141ec942d0b779c3afca3b609d1f2fe0 Mon Sep 17 00:00:00 2001 From: collijk Date: Fri, 14 Jun 2024 21:44:35 -0700 Subject: [PATCH 40/71] Change era5_daily to historical_daily --- src/climate_downscale/generate/__init__.py | 10 +++++----- .../generate/{era5_daily.py => historical_daily.py} | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) rename src/climate_downscale/generate/{era5_daily.py => historical_daily.py} (97%) diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index 21710f6..675c482 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -1,12 +1,12 @@ -from climate_downscale.generate.era5_daily import ( - generate_era5_daily, - generate_era5_daily_task, +from climate_downscale.generate.historical_daily import ( + generate_historical_daily, + generate_historical_daily_task, ) RUNNERS = { - "era5_daily": generate_era5_daily, + "historical_daily": generate_historical_daily, } TASK_RUNNERS = { - "era5_daily": generate_era5_daily_task, + "historical_daily": generate_historical_daily_task, } diff --git a/src/climate_downscale/generate/era5_daily.py b/src/climate_downscale/generate/historical_daily.py similarity index 97% rename from src/climate_downscale/generate/era5_daily.py rename to src/climate_downscale/generate/historical_daily.py index 34b2a24..4d051f9 100644 --- a/src/climate_downscale/generate/era5_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -144,7 +144,7 @@ def load_variable( return ds -def generate_era5_daily_main( +def generate_historical_daily_main( output_dir: str | Path, year: str, target_variable: str, @@ -195,12 +195,12 @@ def generate_era5_daily_main( @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year() @with_variable() -def generate_era5_daily_task( +def generate_historical_daily_task( output_dir: str, year: str, 
target_variable: str, ) -> None: - generate_era5_daily_main(output_dir, year, target_variable) + generate_historical_daily_main(output_dir, year, target_variable) @click.command() # type: ignore[arg-type] @@ -208,7 +208,7 @@ def generate_era5_daily_task( @clio.with_year(allow_all=True) @with_variable(allow_all=True) @clio.with_queue() -def generate_era5_daily( +def generate_historical_daily( output_dir: str, year: str, target_variable: str, From 7f177c5f42a73603bdb33f9cb1a6068a13fef804 Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 08:19:55 -0700 Subject: [PATCH 41/71] Add overwrite --- src/climate_downscale/cli_options.py | 10 +++++++ .../generate/historical_daily.py | 26 ++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index 879465e..31cc067 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -1,5 +1,6 @@ from typing import ParamSpec, TypeVar +import click from rra_tools.cli_tools import ( RUN_ALL, ClickOption, @@ -192,6 +193,14 @@ def with_lon_start( ) +def with_overwrite() -> ClickOption[_P, _T]: + return click.option( + "--overwrite", + is_flag=True, + help="Overwrite existing files.", + ) + + __all__ = [ "VALID_YEARS", "VALID_MONTHS", @@ -222,4 +231,5 @@ def with_lon_start( "RUN_ALL", "ClickOption", "with_choice", + "with_overwrite", ] diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 4d051f9..6b39bc4 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -1,3 +1,4 @@ +import itertools import typing from pathlib import Path @@ -208,26 +209,43 @@ def generate_historical_daily_task( @clio.with_year(allow_all=True) @with_variable(allow_all=True) @clio.with_queue() +@clio.with_overwrite() def generate_historical_daily( output_dir: str, year: str, target_variable: str, queue: str, + overwrite: bool, ) -> None: + cd_data = ClimateDownscaleData(output_dir) + years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] variables = ( list(TRANSFORM_MAP.keys()) if target_variable == clio.RUN_ALL else [target_variable] ) + years_and_variables = [] + complete = [] + for y, v in itertools.product(years, variables): + path = cd_data.daily_results_path("historical", v, y) + if not path.exists() or overwrite: + years_and_variables.append((y, v)) + else: + complete.append((y, v)) + + print( + f"{len(complete)} tasks already done. 
" + f"Launching {len(years_and_variables)} tasks" + ) jobmon.run_parallel( runner="cdtask", task_name="generate historical_daily", - node_args={ - "year": years, - "target-variable": variables, - }, + flat_node_args=( + ("year", "target_variable"), + years_and_variables, + ), task_args={ "output-dir": output_dir, }, From 9309a64b6b2653a9a7389e12a954d69155e38c18 Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 08:20:20 -0700 Subject: [PATCH 42/71] Bump runtime --- src/climate_downscale/generate/historical_daily.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 6b39bc4..a5d3ffa 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -253,7 +253,7 @@ def generate_historical_daily( "queue": queue, "cores": 5, "memory": "200G", - "runtime": "120m", + "runtime": "240m", "project": "proj_rapidresponse", }, max_attempts=1, From d37be57d6291f55deb722608c8cb1e37be5ce315 Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 09:18:06 -0700 Subject: [PATCH 43/71] Add worflow to generate historical reference --- src/climate_downscale/data.py | 2 +- src/climate_downscale/generate/__init__.py | 6 ++ .../generate/historical_daily.py | 12 +-- .../generate/historical_reference.py | 81 +++++++++++++++++++ src/climate_downscale/generate/utils.py | 3 +- 5 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 src/climate_downscale/generate/historical_reference.py diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index ef77ebf..4e6e3f9 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -130,7 +130,7 @@ def save_daily_results( scenario: str, variable: str, year: int | str, - **encoding_kwargs: Any, + encoding_kwargs: dict[str, Any], ): path = self.daily_results_path(scenario, variable, year) mkdir(path.parent, exist_ok=True, parents=True) diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index 675c482..022426b 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -2,11 +2,17 @@ generate_historical_daily, generate_historical_daily_task, ) +from climate_downscale.generate.historical_reference import ( + generate_historical_reference, + generate_historical_reference_task, +) RUNNERS = { "historical_daily": generate_historical_daily, + "historical_reference": generate_historical_reference, } TASK_RUNNERS = { "historical_daily": generate_historical_daily_task, + "historical_reference": generate_historical_reference_task, } diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index a5d3ffa..979312a 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -99,7 +99,7 @@ _T = typing.TypeVar("_T") -def with_variable( +def with_target_variable( *, allow_all: bool = False, ) -> clio.ClickOption[_P, _T]: @@ -187,15 +187,17 @@ def generate_historical_daily_main( scenario="historical", variable=target_variable, year=year, - add_offset=e_offset, - scale_factor=e_scale, + encoding_kwargs={ + "add_offset": e_offset, + "scale_factor": e_scale, + }, ) @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year() -@with_variable() +@with_target_variable() def 
generate_historical_daily_task( output_dir: str, year: str, @@ -207,7 +209,7 @@ def generate_historical_daily_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year(allow_all=True) -@with_variable(allow_all=True) +@with_target_variable(allow_all=True) @clio.with_queue() @clio.with_overwrite() def generate_historical_daily( diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py new file mode 100644 index 0000000..6fee78d --- /dev/null +++ b/src/climate_downscale/generate/historical_reference.py @@ -0,0 +1,81 @@ +import click +import xarray as xr +from rra_tools import jobmon + +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData +from climate_downscale.generate import utils +from climate_downscale.generate.historical_daily import ( + TRANSFORM_MAP, + with_target_variable, +) + + +def generate_historical_reference_main( + output_dir: str, + target_variable: str, +) -> None: + cd_data = ClimateDownscaleData(output_dir) + paths = [ + cd_data.daily_results_path("historical", target_variable, year) + for year in utils.REFERENCE_YEARS + ] + + reference_data = [] + for path in paths: + ds = xr.load_dataset(path).groupby("time.month").mean("time") + reference_data.append(ds) + + encoding_kwargs = xr.open_dataset(paths[0])["value"].encoding + + reference = sum(reference_data) / len(reference_data) + cd_data.save_daily_results( + reference, + scenario="historical", + variable=target_variable, + year="reference", + encoding_kwargs=encoding_kwargs, + ) + + +@click.command() +@clio.with_output_directory(DEFAULT_ROOT) +@with_target_variable() +def generate_historical_reference_task( + output_dir: str, + target_variable: str, +) -> None: + generate_historical_reference_main(output_dir, target_variable) + + +@click.command() +@clio.with_output_directory(DEFAULT_ROOT) +@with_target_variable(allow_all=True) +@clio.with_queue() +def generate_historical_reference( + output_dir: str, + target_variable: str, + queue: str, +) -> None: + variables = ( + list(TRANSFORM_MAP) if target_variable == clio.RUN_ALL else [target_variable] + ) + + jobmon.run_parallel( + runner="cdtask", + task_name="generate historical_reference", + node_args={ + "target-variable": variables, + }, + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 1, + "memory": "100G", + "runtime": "240m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + ) diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index a0ed5b4..f87d14a 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -1,7 +1,8 @@ import numpy as np import xarray as xr -REFERENCE_PERIOD = slice("2015-01-01", "2024-12-31") +REFERENCE_YEARS = list(range(2018, 2024)) +REFERENCE_PERIOD = slice(f"{REFERENCE_YEARS[0]}-01-01", f"{REFERENCE_YEARS[-1]}-12-31") TARGET_LON = xr.DataArray( np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" ) From 018851877525cf808bd56de70a2d737a459b2257 Mon Sep 17 00:00:00 2001 From: James Collins Date: Sat, 15 Jun 2024 18:21:44 -0700 Subject: [PATCH 44/71] typo --- src/climate_downscale/generate/historical_daily.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index a5d3ffa..362ae39 100644 
--- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -243,7 +243,7 @@ def generate_historical_daily( runner="cdtask", task_name="generate historical_daily", flat_node_args=( - ("year", "target_variable"), + ("year", "target-variable"), years_and_variables, ), task_args={ From ec7cf80817f76aec3be995735c86074812c4cb54 Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 18:34:43 -0700 Subject: [PATCH 45/71] Add tasmin/tasmax, overwrite option, and some robustness --- src/climate_downscale/cli_options.py | 2 + src/climate_downscale/data.py | 3 + src/climate_downscale/extract/cmip6.py | 61 +++++++++++++------ .../{cmip6_daily.py => scenario_daily.py} | 10 +-- 4 files changed, 52 insertions(+), 24 deletions(-) rename src/climate_downscale/generate/{cmip6_daily.py => scenario_daily.py} (99%) diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index 31cc067..38117ce 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -147,6 +147,8 @@ def with_cmip6_experiment( "vas", "hurs", "tas", + "tasmin", + "tasmax", "pr", ] diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 4e6e3f9..6af375b 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -52,6 +52,9 @@ def load_cmip6_metadata(self) -> pd.DataFrame: meta.to_parquet(meta_path) return pd.read_parquet(meta_path) + def extracted_cmip6_path(self, variable: str, experiment: str, source: str, member: str) -> Path: + return self.extracted_cmip6 / f"{variable}_{experiment}_{source}_{member}.nc" + @property def ncei_climate_stations(self) -> Path: return self.extracted_data / "ncei_climate_stations" diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index eeb8863..e032658 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -1,3 +1,4 @@ +import itertools from pathlib import Path import click @@ -13,6 +14,8 @@ "vas": (0.0, 0.01), "hurs": (0.0, 0.01), "tas": (273.15, 0.01), + "tasmin": (273.15, 0.01), + "tasmax": (273.15, 0.01), "pr": (0.0, 1e-9), } @@ -34,6 +37,7 @@ def extract_cmip6_main( cmip6_source: str, cmip6_experiment: str, cmip6_variable: str, + overwrite: bool, ) -> None: print(f"Checking metadata for {cmip6_source} {cmip6_experiment} {cmip6_variable}") cd_data = ClimateDownscaleData(output_dir) @@ -50,26 +54,40 @@ def extract_cmip6_main( print(f"Extracting {len(meta_subset)} members...") for member, zstore_path in meta_subset.items(): - print("Extracting", member, zstore_path) - cmip_data = load_cmip_data(zstore_path) - out_filename = f"{cmip6_variable}_{cmip6_experiment}_{cmip6_source}_{member}.nc" - out_path = cd_data.cmip6 / out_filename - shell_tools.touch(out_path, exist_ok=True) - shift, scale = VARIABLE_ENCODINGS[cmip6_variable] - print("Writing to", out_path) - cmip_data.to_netcdf( - out_path, - encoding={ - cmip6_variable: { - "dtype": "int16", - "scale_factor": scale, - "add_offset": shift, - "_FillValue": -32767, - "zlib": True, - "complevel": 1, - } - }, + out_path = cd_data.extracted_cmip6_path( + cmip6_variable, + cmip6_experiment, + cmip6_source, + member, ) + if out_path.exists() and not overwrite: + print("Skipping", member, zstore_path) + continue + + try: + print("Extracting", member, zstore_path) + cmip_data = load_cmip_data(zstore_path) + + shell_tools.touch(out_path, exist_ok=True) + shift, scale = VARIABLE_ENCODINGS[cmip6_variable] + 
print("Writing to", out_path) + cmip_data.to_netcdf( + out_path, + encoding={ + cmip6_variable: { + "dtype": "int16", + "scale_factor": scale, + "add_offset": shift, + "_FillValue": -32767, + "zlib": True, + "complevel": 1, + } + }, + ) + except Exception as e: + if out_path.exists(): + out_path.unlink() + raise e @click.command() # type: ignore[arg-type] @@ -77,13 +95,15 @@ def extract_cmip6_main( @clio.with_cmip6_source() @clio.with_cmip6_experiment() @clio.with_cmip6_variable() +@clio.with_overwrite() def extract_cmip6_task( output_dir: str, cmip6_source: str, cmip6_experiment: str, cmip6_variable: str, + overwrite: bool, ) -> None: - extract_cmip6_main(output_dir, cmip6_source, cmip6_experiment, cmip6_variable) + extract_cmip6_main(output_dir, cmip6_source, cmip6_experiment, cmip6_variable, overwrite) @click.command() # type: ignore[arg-type] @@ -92,6 +112,7 @@ def extract_cmip6_task( @clio.with_cmip6_experiment(allow_all=True) @clio.with_cmip6_variable(allow_all=True) @clio.with_queue() +@clio.with_overwrite() def extract_cmip6( output_dir: str, cmip6_source: str, diff --git a/src/climate_downscale/generate/cmip6_daily.py b/src/climate_downscale/generate/scenario_daily.py similarity index 99% rename from src/climate_downscale/generate/cmip6_daily.py rename to src/climate_downscale/generate/scenario_daily.py index aa148a2..283ff77 100644 --- a/src/climate_downscale/generate/cmip6_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -7,6 +7,11 @@ from climate_downscale.data import ClimateDownscaleData from climate_downscale.generate import utils +TRANSFORM_MAP = { + "tas": (utils.kelvin_to_celsius, "additive"), + "pr": (utils.precipitation_flux_to_rainfall, "multiplicative"), +} + def compute_anomaly( reference: xr.DataArray, target: xr.DataArray, anomaly_type: str @@ -29,10 +34,7 @@ def compute_anomaly( return anomaly -TRANSFORM_MAP = { - "tas": (utils.kelvin_to_celsius, "additive"), - "pr": (utils.precipitation_flux_to_rainfall, "multiplicative"), -} + def load_reference_and_target( From fd58f3d60f0b0f22e57df28dd263f481434dbbc1 Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 18:44:40 -0700 Subject: [PATCH 46/71] Add tasmin/tasmax, overwrite option, and some robustness. Fix bugs. 
--- src/climate_downscale/data.py | 2 +- .../generate/historical_daily.py | 42 ++-- .../generate/historical_reference.py | 18 +- .../generate/scenario_daily.py | 121 +++++----- src/climate_downscale/generate/utils.py | 10 +- src/climate_downscale/old_climate/__init__.py | 0 src/climate_downscale/old_climate/data.py | 219 ------------------ .../old_climate/project_anomaly.py | 127 ---------- .../old_climate/project_climate.py | 172 -------------- 9 files changed, 104 insertions(+), 607 deletions(-) delete mode 100644 src/climate_downscale/old_climate/__init__.py delete mode 100644 src/climate_downscale/old_climate/data.py delete mode 100644 src/climate_downscale/old_climate/project_anomaly.py delete mode 100644 src/climate_downscale/old_climate/project_climate.py diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 6af375b..f27355d 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -134,7 +134,7 @@ def save_daily_results( variable: str, year: int | str, encoding_kwargs: dict[str, Any], - ): + ) -> None: path = self.daily_results_path(scenario, variable, year) mkdir(path.parent, exist_ok=True, parents=True) touch(path, exist_ok=True) diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 979312a..20012fa 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -122,24 +122,31 @@ def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: def load_variable( + cd_data: ClimateDownscaleData, variable: str, year: str, month: str, dataset: str = "single-levels", ) -> xr.Dataset: - root = Path("/mnt/share/erf/climate_downscale/extracted_data/era5") - p = root / f"reanalysis-era5-{dataset}_{variable}_{year}_{month}.nc" - if dataset == "land" and not p.exists(): - raise NotImplementedError - # Substitute the single level dataset pre-interpolated at the target resolution. - p = root / f"reanalysis-era5-single-levels_{variable}_{year}_{month}.nc" - ds = utils.interpolate_to_target_latlon(load_and_shift_longitude(p)) + path = cd_data.extracted_era5_path(dataset, variable, year, month) + if dataset == "land" and not path.exists(): + if variable != "total_sky_direct_solar_radiation_at_surface": + # We only fallback for the one dataset, otherwise extraction failed. + msg = f"Land dataset not found for {variable}. Extraction likely failed." + raise ValueError(msg) + # If the land dataset doesn't exist, fall back to the single-levels dataset + path = cd_data.extracted_era5_path("single-levels", variable, year, month) + ds = load_and_shift_longitude(path) + # We expect this to already be in the correct grid, so interpolate. + ds = utils.interpolate_to_target_latlon(ds) elif dataset == "land": - ds = load_and_shift_longitude(p).assign_coords( - latitude=utils.TARGET_LAT, longitude=utils.TARGET_LON - ) + ds = load_and_shift_longitude(path) + # There are some slight numerical differences in the lat/long for some of + # the land datasets. They are gridded consistently, so just tweak the + # coordinates so things align. 
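+        # assign_coords only relabels the coordinate arrays; no data values
+        # are moved or reindexed, so this is safe when the grids match.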
+ ds = ds.assign_coords(latitude=utils.TARGET_LAT, longitude=utils.TARGET_LON) else: - ds = load_and_shift_longitude(p) + ds = load_and_shift_longitude(path) conversion = CONVERT_MAP[variable] ds = conversion(utils.rename_val_column(ds)) return ds @@ -150,14 +157,15 @@ def generate_historical_daily_main( year: str, target_variable: str, ) -> None: - source_variables, collapse_fun, (e_offset, e_scale) = TRANSFORM_MAP[target_variable] + cd_data = ClimateDownscaleData(output_dir) + source_variables, collapse_fun, (e_offset, e_scale) = TRANSFORM_MAP[target_variable] datasets = [] for month in range(1, 13): month_str = f"{month:02d}" print(f"loading single-levels for {month_str}") single_level = [ - load_variable(sv, year, month_str, "single-levels") + load_variable(cd_data, sv, year, month_str, "single-levels") for sv in source_variables ] print("collapsing") @@ -169,7 +177,10 @@ def generate_historical_daily_main( ds_land_res = utils.interpolate_to_target_latlon(ds) print(f"loading land for {month_str}") - land = [load_variable(sv, year, month_str, "land") for sv in source_variables] + land = [ + load_variable(cd_data, sv, year, month_str, "land") + for sv in source_variables + ] print("collapsing") with dask.config.set(**{"array.slicing.split_large_chunks": False}): ds_land = collapse_fun(*land).compute() # type: ignore[operator] @@ -181,7 +192,6 @@ def generate_historical_daily_main( ds_year = xr.concat(datasets, dim="date").sortby("date") - cd_data = ClimateDownscaleData(output_dir) cd_data.save_daily_results( ds_year, scenario="historical", @@ -217,7 +227,7 @@ def generate_historical_daily( year: str, target_variable: str, queue: str, - overwrite: bool, + overwrite: bool, # noqa: FBT001 ) -> None: cd_data = ClimateDownscaleData(output_dir) diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py index 6fee78d..e68213c 100644 --- a/src/climate_downscale/generate/historical_reference.py +++ b/src/climate_downscale/generate/historical_reference.py @@ -20,15 +20,29 @@ def generate_historical_reference_main( cd_data.daily_results_path("historical", target_variable, year) for year in utils.REFERENCE_YEARS ] + print(f"Building reference data from: {len(paths)} files.") reference_data = [] for path in paths: - ds = xr.load_dataset(path).groupby("time.month").mean("time") + print(f"Loading: {path}") + ds = xr.load_dataset(path) + print("Computing monthly means") + ds = ds.groupby("time.month").mean("time") reference_data.append(ds) - encoding_kwargs = xr.open_dataset(paths[0])["value"].encoding + old_encoding = { + k: v for k, v in xr.open_dataset(paths[0])["value"].encoding.items() + if k in ['dtype', '_FillValue', 'scale_factor', 'add_offset'] + } + encoding_kwargs = { + "zlib": True, + "complevel": 1, + **old_encoding, + } + print("Averaging years by month") reference = sum(reference_data) / len(reference_data) + print("Saving reference data") cd_data.save_daily_results( reference, scenario="historical", diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 283ff77..8bfad67 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -7,14 +7,51 @@ from climate_downscale.data import ClimateDownscaleData from climate_downscale.generate import utils -TRANSFORM_MAP = { - "tas": (utils.kelvin_to_celsius, "additive"), - "pr": (utils.precipitation_flux_to_rainfall, "multiplicative"), + +# Map from source 
variable to a unit conversion function +CONVERT_MAP = { + "tas": utils.kelvin_to_celsius, + "pr": utils.precipitation_flux_to_rainfall, } +def load_and_shift_longitude( + ds_path: str | Path, + time_slice: slice, +) -> xr.Dataset: + ds = xr.open_dataset(ds_path).sel(time=time_slice).compute() + ds = ( + ds + .rename({"lat": "latitude", "lon": "longitude", "time": "date"}) + .assign_coords(longitude=(ds.longitude + 180) % 360 - 180) + .sortby("longitude") + ) + return ds + + +def load_variable( + member_path: str | Path, + variable: str, + year: str, +) -> xr.Dataset: + if year == "reference": + ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD) + ds = ds.groupby("date.month").mean("date") + else: + time_slice = slice(f"{year}-01-01", f"{year}-12-31") + time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") + ds = load_and_shift_longitude(member_path, time_slice) + ds = ( + ds.assign_coords(date=ds.date.dt.floor("D")) + .interp(date=time_range) + .interpolate_na(dim="date", method="nearest", fill_value="extrapolate") + ) + conversion = CONVERT_MAP[variable] + ds = conversion(utils.rename_val_column(ds)) + return ds + def compute_anomaly( - reference: xr.DataArray, target: xr.DataArray, anomaly_type: str + reference: xr.Dataset, target: xr.Dataset, anomaly_type: str ) -> xr.Dataset: if anomaly_type == "additive": anomaly = target.groupby("time.month") - reference @@ -33,73 +70,25 @@ def compute_anomaly( anomaly = utils.interpolate_to_target_latlon(anomaly) return anomaly - - - - -def load_reference_and_target( - path: str | Path, year: str | int -) -> tuple[xr.Dataset, xr.Dataset]: - reference = ( - xr.open_dataset(path) - .sel(time=utils.REFERENCE_PERIOD) - .compute() # Load the subset before computing the mean, otherwise it's slow - .groupby("time.month") - .mean("time") - ) - - time_slice = slice(f"{year}-01", f"{year}-12") - time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") - target = xr.open_dataset(path).sel(time=time_slice).compute() - target = ( - target.assign_coords(time=target.time.dt.floor("D")) - .interp_calendar(time_range) - .interpolate_na(dim="time", method="nearest", fill_value="extrapolate") - ) - return reference, target - - -def generate_cmip6_daily_main( +def generate_scenario_daily_main( output_dir: str | Path, year: str | int, target_variable: str, cmip_scenario: str, - rerefk, ) -> None: cd_data = ClimateDownscaleData(output_dir) - paths = cd_data.cmip6.glob(f"{target_variable}_{cmip_scenario}*.nc") - - -def compute_anomaly(path, year): - reference_period = slice("2015-01-01", "2024-12-31") - - anomaly = target.groupby("time.month") - ref - anomaly = anomaly.rename({"lat": "latitude", "lon": "longitude"}) - anomaly = anomaly.assign_coords( - longitude=(anomaly.longitude + 180) % 360 - 180 - ).sortby("longitude") - anomaly = utils.interpolate_to_target_latlon( - anomaly, target_lat=TARGET_LAT, target_lon=TARGET_LON - ) - - return anomaly - - -variable = "tas" -scenario = "ssp119" -year = "2024" - -paths = sorted( - list( - Path("/mnt/share/erf/climate_downscale/extracted_data/cmip6").glob( - "tas_ssp119*.nc" + paths = cd_data.extracted_cmip6.glob(f"{target_variable}_{cmip_scenario}*.nc") + + for path in paths: + reference = load_variable(path, target_variable, "reference") + target = load_variable(path, target_variable, year) + + anomaly_type = TRANSFORM_MAP[target_variable][1] + anomaly = compute_anomaly(reference, target, anomaly_type) + cd_data.save_daily_results( + anomaly, + scenario=cmip_scenario, + variable=target_variable, + 
year=year, ) - ) -) -p = paths[0] - - -a = 1 / len(paths) * compute_anomaly(paths[0], year) -for p in tqdm.tqdm(paths[1:]): - a += 1 / len(paths) * compute_anomaly(p, year) diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index f87d14a..366db11 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -62,7 +62,7 @@ def precipitation_flux_to_rainfall(precipitation_flux: xr.Dataset) -> xr.Dataset """ seconds_per_day = 86400 mm_per_kg_m2 = 1 - return seconds_per_day * mm_per_kg_m2 * precipitation_flux # type: ignore[no-any-return]k + return seconds_per_day * mm_per_kg_m2 * precipitation_flux # type: ignore[no-any-return] def scale_wind_speed_height(wind_speed_10m: xr.Dataset) -> xr.Dataset: @@ -289,6 +289,8 @@ def rename_val_column(ds: xr.Dataset) -> xr.Dataset: def interpolate_to_target_latlon( ds: xr.Dataset, ) -> xr.Dataset: - return ds.interp( - longitude=TARGET_LON, latitude=TARGET_LAT, method="nearest" - ).interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") + return ( + ds.interp(longitude=TARGET_LON, latitude=TARGET_LAT, method="nearest") + .interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") + .interpolate_na(dim="latitude", method="nearest", fill_value="extrapolate") + ) diff --git a/src/climate_downscale/old_climate/__init__.py b/src/climate_downscale/old_climate/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/climate_downscale/old_climate/data.py b/src/climate_downscale/old_climate/data.py deleted file mode 100644 index b5422d2..0000000 --- a/src/climate_downscale/old_climate/data.py +++ /dev/null @@ -1,219 +0,0 @@ -import itertools -from collections.abc import Callable - -import gcsfs -import pandas as pd -import xarray as xr - - -def load_cmip_metadata( - tables: tuple[str, ...] = ("Amon", "day"), - variables: tuple[str, ...] = ("tas", "pr"), - experiments: tuple[str, ...] = ( - "historical", - "ssp126", - "ssp245", - "ssp370", - "ssp585", - ), -) -> pd.DataFrame: - """Loads CMIP6 metadata for the given tables, variables, and experiments. - - Parameters - ---------- - tables - The tables to include. - variables - The variables to include. - experiments - The experiments to include. - - Returns - ------- - pd.DataFrame - CMIP6 metadata containing only the institutions and sources with all - tables, variables, and experiments. - """ - all_models = load_raw_cmip_metadata() - models_and_params = filter_institutions_and_sources( - all_models, - tables, - variables, - experiments, - ) - - # There should be no duplicates here, but there are. I'm not going to investigate - # why, but I'm just going to drop them. - member_count = models_and_params.groupby( - ["institution_id", "source_id", "member_id"] - )["activity_id"].count() - expected_count = len(tables) * len(variables) * len(experiments) - member_mask = member_count == expected_count - - final_models = ( - models_and_params.set_index(["institution_id", "source_id", "member_id"]) - .loc[member_mask[member_mask].index] - .reset_index() - ) - - # Filter to the models we need for the anomaly analysis. 
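# NOTE: the two masks below keep monthly ("Amon") tables only for the
# historical experiment and daily ("day") tables only for the scenario
# experiments. That split matches the anomaly design used elsewhere in this
# series: the historical runs only need to supply a monthly climatology as a
# baseline, while the scenario runs are consumed at daily resolution.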
- monthly_historical = (final_models["table_id"] == "Amon") & ( - final_models["experiment_id"] == "historical" - ) - daily_scenario = (final_models["table_id"] == "day") & ( - final_models["experiment_id"] != "historical" - ) - return final_models.loc[monthly_historical | daily_scenario] - - -def load_cmip_historical_data(path: str) -> xr.Dataset: - """Loads a CMIP6 historical dataset from a zarr path. - - Parameters - ---------- - path - The path to the zarr store. - - Returns - ------- - xr.Dataset - The CMIP6 historical dataset. - """ - reference_period = slice("1981-01-15", "2010-12-15") - return ( - load_cmip_data(path) - .sel(time=reference_period) - .groupby("time.month") - .mean("time") - ) - - -def load_cmip_experiment_data(path: str, year: str) -> xr.Dataset: - """Loads a CMIP6 experiment dataset from a zarr path by day for a given year. - - Parameters - ---------- - path - The path to the zarr store. - year - The year to load. - - Returns - ------- - xr.Dataset - The CMIP6 experiment dataset for the given year. - """ "" - time_slice = slice(f"{year}-01", f"{year}-12") - time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") - return load_cmip_data(path).sel(time=time_slice).interp_calendar(time_range) - - -################## -# Helper methods # -################## - - -def load_raw_cmip_metadata() -> pd.DataFrame: - """Loads metadata containing information about all CMIP6 models.""" - path = "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv" - return pd.read_csv(path) - - -def load_cmip_data(zarr_path: str) -> xr.Dataset: - """Loads a CMIP6 dataset from a zarr path.""" - gcs = gcsfs.GCSFileSystem(token="anon") # noqa: S106 - mapper = gcs.get_mapper(zarr_path) - ds = xr.open_zarr(mapper, consolidated=True) - lon = (ds.lon + 180) % 360 - 180 - ds = ds.assign_coords(lon=lon).sortby("lon") - ds = ds.drop( - ["lat_bnds", "lon_bnds", "time_bnds", "height", "time_bounds", "bnds"], - errors="ignore", - ) - return ds # type: ignore[no-any-return] - - -def contains_combo( - table: str, - variable: str, - experiment: str, -) -> Callable[[pd.DataFrame], bool]: - """Get a function to check if a dataset contains a given cmip metadata combination. - - Parameters - ---------- - table - The table to check for. - variable - The variable to check for. - experiment - The experiment to check for. - - Returns - ------- - Callable[[pd.DataFrame], bool] - A function that checks if a dataset contains a given cmip metadata combination. - """ - - def _check(df: pd.DataFrame) -> bool: - return ( - df["table_id"].eq(table) - & df["variable_id"].eq(variable) - & df["experiment_id"].eq(experiment) - ).any() - - return _check - - -def filter_institutions_and_sources( - cmip_meta: pd.DataFrame, - tables: tuple[str, ...], - variables: tuple[str, ...], - experiments: tuple[str, ...], -) -> pd.DataFrame: - """Filters a cmip metadata dataframe to only include models that have all - combinations of the given tables, variables, and experiments. - Parameters - ---------- - cmip_meta - CMIP metadata dataframe. - tables - The tables to include. - variables - The variables to include. - experiments - The experiments to include. - Returns - ------- - pd.DataFrame - Filtered cmip metadata containing only the institutions and sources with all - tables, variables, and experiments. - """ - # First we filter down to all models from the institutions and sources that have - # all the combinations of tables, variables, and experiments. 
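# NOTE: the loop below builds one boolean mask per (table, variable,
# experiment) combination, each indexed by (institution_id, source_id);
# pd.concat(masks, axis=1).all(axis=1) then intersects them. With the
# default arguments (2 tables x 2 variables x 5 experiments) that is 20
# masks, so an institution/source pair survives only if all 20 combinations
# are present in the metadata.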
- masks = [] - for table, variable, experiment in itertools.product( - tables, variables, experiments - ): - has_combo = cmip_meta.groupby(["institution_id", "source_id"]).apply( - contains_combo(table, variable, experiment) - ) - masks.append(has_combo) - mask = pd.concat(masks, axis=1).all(axis=1) - - institutions_and_sources = mask[mask].index - models_with_all_params = ( - cmip_meta.set_index(["institution_id", "source_id"]) - .loc[institutions_and_sources] - .reset_index() - ) - - # Now we filter down to the specific subset of table/variable/experiment - # combinations within the institutions and sources. - param_mask = ( - models_with_all_params["table_id"].isin(tables) - & models_with_all_params["variable_id"].isin(variables) - & models_with_all_params["experiment_id"].isin(experiments) - ) - models_and_params = models_with_all_params[param_mask] - return models_and_params diff --git a/src/climate_downscale/old_climate/project_anomaly.py b/src/climate_downscale/old_climate/project_anomaly.py deleted file mode 100644 index c35ef40..0000000 --- a/src/climate_downscale/old_climate/project_anomaly.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import TYPE_CHECKING - -import click -import pandas as pd -from rra_population_pipelines.pipelines.climate import data -from rra_population_pipelines.shared.cli_tools import options as clio -from rra_population_pipelines.shared.data import RRA_POP -from rra_tools import jobmon - -if TYPE_CHECKING: - import xarray as xr - -_ENSEMBLE_MEMBERS = [ - ("NCAR", "CESM2"), - ("MOHC", "UKESM1-0-LL"), - ("IPSL", "IPSL-CM6A-LR"), - ("MPI-M", "MPI-ESM1-2-LR"), - ("MIROC", "MIROC6"), - ("NOAA-GFDL", "GFDL-ESM4"), -] - -_VALID_YEARS = tuple([str(y) for y in range(2015, 2101)]) - - -def compute_common_lat_lon( - run_metadata: pd.DataFrame, -) -> tuple[pd.Index[float], pd.Index[float]]: - lat = pd.Index([], name="lat", dtype=float) - lon = pd.Index([], name="lon", dtype=float) - - for key in run_metadata.index.tolist(): - historical = data.load_cmip_historical_data(run_metadata.loc[key, "historical"]) - lat = lat.union(historical["lat"]) # type: ignore[arg-type] - lon = lon.union(historical["lon"]) # type: ignore[arg-type] - return lat, lon - - -def compute_single_model_anomaly( - historical: xr.Dataset, - experiment: xr.Dataset, - variable: str, -) -> xr.Dataset: - if variable == "tas": - anomaly = experiment.groupby("time.month") - historical - else: - historical = 86400 * historical + 1 - experiment = 86400 * experiment + 1 - anomaly = (1 / historical) * experiment.groupby("time.month") - return anomaly - - -def interp_common_lat_lon( - ds: xr.Dataset, lat: pd.Index[float], lon: pd.Index[float] -) -> xr.Dataset: - return ( - ds.pad(lon=1, mode="wrap") - .assign_coords(lon=ds.lon.pad(lon=1, mode="reflect", reflect_type="odd")) - .interp(lat=lat, lon=lon) - ) - - -def project_anomaly_main(variable: str, experiment: str, year: str) -> xr.Dataset: - run_meta = get_run_metadata(variable, experiment) - lat, lon = compute_common_lat_lon(run_meta) - - anomalies: list[xr.Dataset] = [] - for key in run_meta.index.tolist(): - historical = data.load_cmip_historical_data(run_meta.loc[key, "historical"]) - scenario = data.load_cmip_experiment_data( - run_meta.loc[key, "experiment"], year=year - ) - anomaly = compute_single_model_anomaly(historical, scenario, variable=variable) - anomaly = interp_common_lat_lon(anomaly, lat, lon) - anomalies.append(anomaly) - - mean_anomaly = 1 / len(anomalies) * sum(anomalies) 
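# NOTE: the expression above is the unweighted ensemble mean, i.e.
#     mean_anomaly = sum(anomalies) / len(anomalies)
# written with the scale factor first. Adding the members term-by-term is
# safe because each one was regridded to the shared lat/lon index by
# interp_common_lat_lon, so the datasets align coordinate-for-coordinate.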
- return mean_anomaly # type: ignore[return-value] - - -@click.command() # type: ignore[arg-type] -@click.option( - "--variable", - type=click.Choice(["tas", "pr"]), -) -@clio.with_climate_scenario(allow_all=False) -@clio.with_year(allow_all=False, choices=_VALID_YEARS) -@clio.with_output_directory(RRA_POP.projected_climate_anomaly_data) -def project_anomaly_task( - variable: str, - climate_scenario: str, - year: str, - output_dir: str, -) -> None: - projected_anomaly = project_anomaly_main(variable, climate_scenario, year) - out_path = Path(output_dir) / "{variable}_{experiment}_{year}.nc" - projected_anomaly.to_netcdf(out_path) - - -@click.command() # type: ignore[arg-type] -@clio.with_output_directory(RRA_POP.projected_climate_anomaly_data) -@clio.with_queue() -def project_anomaly(output_dir: str, queue: str) -> None: - jobmon.run_parallel( - task_name="project_anomaly", - node_args={ - "variable": [ - "tas", - "pr", - ], - "experiment": list(clio.VALID_CLIMATE_SCENARIOS), - "year": list(_VALID_YEARS), - }, - task_args={ - "output-dir": output_dir, - }, - task_resources={ - "queue": queue, - "cores": 2, - "memory": "70G", - "runtime": "120m", - "project": "proj_rapidresponse", - }, - runner="rptask", - ) diff --git a/src/climate_downscale/old_climate/project_climate.py b/src/climate_downscale/old_climate/project_climate.py deleted file mode 100644 index 1b366cb..0000000 --- a/src/climate_downscale/old_climate/project_climate.py +++ /dev/null @@ -1,172 +0,0 @@ -import click -import pandas as pd -import xarray as xr -from rra_population_pipelines.shared.cli_tools import options as clio -from rra_population_pipelines.shared.data import ( - RRA_DATA_ROOT, - RRA_POP, - RRAPopulationData, -) -from rra_tools import jobmon - - -def get_chelsa(variable: str, lat: slice, lon: slice) -> xr.Dataset: - ds_paths = [ - RRA_POP.get_downscaled_reference_map_path(variable, month) - for month in range(1, 13) - ] - ds = ( - xr.open_mfdataset( - ds_paths, - chunks={"lat": -1, "lon": -1}, - concat_dim=[pd.Index(range(1, 13), name="month")], # type: ignore[arg-type] - combine="nested", - ) - .sel(lat=lat, lon=lon) - .rename({"Band1": variable}) - .drop_vars("crs") - ) - if variable == "tas": # noqa: SIM108 - ds = 0.1 * ds - 273.15 - else: - ds = 0.1 * ds - return ds - - -def load_and_downscale_anomaly( - variable: str, - scenario: str, - year: int, - lat: xr.DataArray, - lon: xr.DataArray, -) -> xr.Dataset: - in_root = ( - RRA_POP.human_niche_data - / "chelsa-downscaled-projections" - / "_anomalies" - / "GLOBAL" - ) - path = in_root / f"{variable}_{scenario}_{year}.nc" - ds = xr.open_dataset( - path, - # Load the whole thing, but use a dask array - chunks={"lat": -1, "lon": -1, "time": -1}, - ).interp(lat=lat, lon=lon) - return ds - - -def apply_anomaly(data: xr.Dataset, anomaly: xr.Dataset) -> xr.Dataset: - if "tas" in anomaly.keys(): # noqa: SIM118 - result = anomaly.groupby("time.month") + data - else: - result = anomaly.groupby("time.month") * data * (1 / 30) - return result - - -def compute_measure(data: xr.Dataset, measure: str) -> xr.Dataset: - if measure == "temperature": - result = data.mean("time") - elif measure == "precipitation": - result = data.sum("time") - else: - threshold = 30 - result = (data > threshold).sum("time") - return result - - -def project_climate_main( - iso3: str, - measure: str, - scenario: str, - pop_data_dir: str, -) -> None: - pop_data = RRAPopulationData(pop_data_dir) - admin0 = pop_data.load_shapefile( - admin_level=0, - iso3=iso3, - year=2022, - ) - minx, miny, maxx, maxy 
= admin0.total_bounds - lat, lon = slice(miny, maxy), slice(minx, maxx) - - variable = { - "temperature": "tas", - "precipitation": "pr", - "days_over_thirty": "tas", - }[measure] - - print("Working on", scenario, measure) - ds = get_chelsa(variable, lat, lon) - - results = [] - for year in range(2015, 2101): - anom = load_and_downscale_anomaly( - variable, scenario, year, ds["lat"], ds["lon"] - ) - result = apply_anomaly(ds, anom) - result = compute_measure(result, measure) - results.append(result) - result = xr.concat(results, dim=pd.Index(range(2015, 2101), name="year")) - - print("Writing results") - pop_data.save_climate_data( - result, - measure=measure, - iso3=iso3, - scenario=scenario, - ) - - -@click.command() # type: ignore[arg-type] -@clio.with_iso3(allow_all=False) -@click.option( - "--measure", - type=click.Choice(["temperature", "precipitation", "days_over_thirty"]), -) -@clio.with_climate_scenario(allow_all=False) -@clio.with_input_directory("pop-data", RRA_DATA_ROOT) -def project_climate_task( - iso3: str, - measure: str, - climate_scenario: str, - pop_data_dir: str, -) -> None: - project_climate_main(iso3, measure, climate_scenario, pop_data_dir) - - -@click.command() # type: ignore[arg-type] -@clio.with_iso3(allow_all=False) -@clio.with_input_directory("pop-data", RRA_DATA_ROOT) -@clio.with_queue() -def project_climate( - iso3: str, - pop_data_dir: str, - queue: str, -) -> None: - pop_data = RRAPopulationData(pop_data_dir) - jobmon.run_parallel( - task_name="project_climate", - node_args={ - "iso3": [ - iso3, - ], - "measure": [ - "temperature", - "precipitation", - "days_over_thirty", - ], - "scenario": list(clio.VALID_CLIMATE_SCENARIOS), - }, - task_args={ - "pop-data-dir": pop_data_dir, - }, - task_resources={ - "queue": queue, - "cores": 2, - "memory": "70G", - "runtime": "120m", - "project": "proj_rapidresponse", - }, - runner="rptask", - log_root=pop_data.climate_data, - ) From 19a7dd499f36ddbdfb5737a1cf5b7bf23c5af037 Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 18:49:39 -0700 Subject: [PATCH 47/71] Formatting --- .../downscale/prepare_training_data.py | 4 ++-- .../generate/historical_daily.py | 4 ++-- .../generate/historical_reference.py | 11 ++++++----- src/climate_downscale/generate/scenario_daily.py | 15 +++++++-------- src/climate_downscale/generate/utils.py | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/climate_downscale/downscale/prepare_training_data.py b/src/climate_downscale/downscale/prepare_training_data.py index 807b0c3..570bb4e 100644 --- a/src/climate_downscale/downscale/prepare_training_data.py +++ b/src/climate_downscale/downscale/prepare_training_data.py @@ -48,7 +48,7 @@ def get_era5_temperature( lon = xr.DataArray(coords["lon"], dims=["points"]) time = xr.DataArray(coords["date"], dims=["points"]) - era5 = cd_data.load_era5_temperature_daily_mean(year) + era5 = cd_data.load_daily_results("historical", "tas", year) era5 = ( era5.assign_coords(longitude=(((era5.longitude + 180) % 360) - 180)) .sortby(["latitude", "longitude"]) @@ -59,7 +59,7 @@ def get_era5_temperature( # expver == 1 is final data. expver == 5 is provisional data # and has a very strong nonsense seasonal trend. 
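# NOTE: when a downloaded ERA5 file spans the ERA5T embargo window
# (roughly the most recent three months), it carries an extra `expver`
# dimension: finalized values sit under expver == 1 and provisional ERA5T
# values under expver == 5, each NaN where the other applies. Selecting
# expver == 1 keeps only the finalized record.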
era5 = era5.sel(expver=1) - return era5["t2m"].to_numpy() - 273.15 + return era5["value"].to_numpy() def prepare_training_data_main(output_dir: str | Path, year: str) -> None: diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 20012fa..412026b 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -114,7 +114,7 @@ def with_target_variable( def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: ds = xr.open_dataset(ds_path).chunk(time=24) - with dask.config.set(**{"array.slicing.split_large_chunks": False}): + with dask.config.set(**{"array.slicing.split_large_chunks": False}): # type: ignore[arg-type] ds = ds.assign_coords(longitude=(ds.longitude + 180) % 360 - 180).sortby( "longitude" ) @@ -182,7 +182,7 @@ def generate_historical_daily_main( for sv in source_variables ] print("collapsing") - with dask.config.set(**{"array.slicing.split_large_chunks": False}): + with dask.config.set(**{"array.slicing.split_large_chunks": False}): # type: ignore[arg-type] ds_land = collapse_fun(*land).compute() # type: ignore[operator] ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py index e68213c..8d349bb 100644 --- a/src/climate_downscale/generate/historical_reference.py +++ b/src/climate_downscale/generate/historical_reference.py @@ -31,8 +31,9 @@ def generate_historical_reference_main( reference_data.append(ds) old_encoding = { - k: v for k, v in xr.open_dataset(paths[0])["value"].encoding.items() - if k in ['dtype', '_FillValue', 'scale_factor', 'add_offset'] + k: v + for k, v in xr.open_dataset(paths[0])["value"].encoding.items() + if k in ["dtype", "_FillValue", "scale_factor", "add_offset"] } encoding_kwargs = { "zlib": True, @@ -44,7 +45,7 @@ def generate_historical_reference_main( reference = sum(reference_data) / len(reference_data) print("Saving reference data") cd_data.save_daily_results( - reference, + reference, # type: ignore[arg-type] scenario="historical", variable=target_variable, year="reference", @@ -52,7 +53,7 @@ def generate_historical_reference_main( ) -@click.command() +@click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @with_target_variable() def generate_historical_reference_task( @@ -62,7 +63,7 @@ def generate_historical_reference_task( generate_historical_reference_main(output_dir, target_variable) -@click.command() +@click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @with_target_variable(allow_all=True) @clio.with_queue() diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 8bfad67..061ccfb 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -1,13 +1,11 @@ from pathlib import Path import pandas as pd -import tqdm import xarray as xr from climate_downscale.data import ClimateDownscaleData from climate_downscale.generate import utils - # Map from source variable to a unit conversion function CONVERT_MAP = { "tas": utils.kelvin_to_celsius, @@ -21,8 +19,7 @@ def load_and_shift_longitude( ) -> xr.Dataset: ds = xr.open_dataset(ds_path).sel(time=time_slice).compute() ds = ( - ds - .rename({"lat": "latitude", "lon": "longitude", "time": "date"}) + ds.rename({"lat": "latitude", "lon": "longitude", "time": 
"date"}) .assign_coords(longitude=(ds.longitude + 180) % 360 - 180) .sortby("longitude") ) @@ -32,7 +29,7 @@ def load_and_shift_longitude( def load_variable( member_path: str | Path, variable: str, - year: str, + year: str | int, ) -> xr.Dataset: if year == "reference": ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD) @@ -50,13 +47,14 @@ def load_variable( ds = conversion(utils.rename_val_column(ds)) return ds + def compute_anomaly( reference: xr.Dataset, target: xr.Dataset, anomaly_type: str ) -> xr.Dataset: if anomaly_type == "additive": anomaly = target.groupby("time.month") - reference elif anomaly_type == "multiplicative": - anomaly = (target.groupby("time.month") + 1) / (reference + 1) + anomaly = (target.groupby("time.month") + 1) / (reference + 1) # type: ignore[operator] else: msg = f"Unknown anomaly type: {anomaly_type}" raise ValueError(msg) @@ -70,6 +68,7 @@ def compute_anomaly( anomaly = utils.interpolate_to_target_latlon(anomaly) return anomaly + def generate_scenario_daily_main( output_dir: str | Path, year: str | int, @@ -83,12 +82,12 @@ def generate_scenario_daily_main( reference = load_variable(path, target_variable, "reference") target = load_variable(path, target_variable, year) - anomaly_type = TRANSFORM_MAP[target_variable][1] + anomaly_type = "additive" # TRANSFORM_MAP[target_variable][1] anomaly = compute_anomaly(reference, target, anomaly_type) cd_data.save_daily_results( anomaly, scenario=cmip_scenario, variable=target_variable, year=year, + encoding_kwargs={"zlib": True, "complevel": 1}, ) - diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index 366db11..eb48c91 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -62,7 +62,7 @@ def precipitation_flux_to_rainfall(precipitation_flux: xr.Dataset) -> xr.Dataset """ seconds_per_day = 86400 mm_per_kg_m2 = 1 - return seconds_per_day * mm_per_kg_m2 * precipitation_flux # type: ignore[no-any-return] + return seconds_per_day * mm_per_kg_m2 * precipitation_flux def scale_wind_speed_height(wind_speed_10m: xr.Dataset) -> xr.Dataset: From 3f43234ecfd7b833f679972a7515cdcae7d5f50f Mon Sep 17 00:00:00 2001 From: James Collins Date: Sat, 15 Jun 2024 18:50:22 -0700 Subject: [PATCH 48/71] typo --- src/climate_downscale/generate/historical_reference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py index 6fee78d..760d015 100644 --- a/src/climate_downscale/generate/historical_reference.py +++ b/src/climate_downscale/generate/historical_reference.py @@ -23,7 +23,7 @@ def generate_historical_reference_main( reference_data = [] for path in paths: - ds = xr.load_dataset(path).groupby("time.month").mean("time") + ds = xr.load_dataset(path).groupby("date.month").mean("date") reference_data.append(ds) encoding_kwargs = xr.open_dataset(paths[0])["value"].encoding From d8ea998145826fee4f8a4ca180a1f9cdd84f346c Mon Sep 17 00:00:00 2001 From: collijk Date: Sat, 15 Jun 2024 19:25:55 -0700 Subject: [PATCH 49/71] thread through overwrite in extract cmip --- pyproject.toml | 2 ++ src/climate_downscale/extract/cmip6.py | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc5fee9..731596e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,6 +98,8 @@ ignore = [ "PYI041", # Use float instead of int | float; dumb rule "T201", # print is fine for 
now. "RET504", # Unnecessary assignment before return + "PLR0913", # Too many arguments in function call, hard with CLIs. + "TRY201", # ] [tool.ruff.lint.per-file-ignores] diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index e032658..3fe546f 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -1,4 +1,3 @@ -import itertools from pathlib import Path import click @@ -37,7 +36,7 @@ def extract_cmip6_main( cmip6_source: str, cmip6_experiment: str, cmip6_variable: str, - overwrite: bool, + overwrite: bool, # noqa: FBT001 ) -> None: print(f"Checking metadata for {cmip6_source} {cmip6_experiment} {cmip6_variable}") cd_data = ClimateDownscaleData(output_dir) @@ -101,9 +100,11 @@ def extract_cmip6_task( cmip6_source: str, cmip6_experiment: str, cmip6_variable: str, - overwrite: bool, + overwrite: bool, # noqa: FBT001 ) -> None: - extract_cmip6_main(output_dir, cmip6_source, cmip6_experiment, cmip6_variable, overwrite) + extract_cmip6_main( + output_dir, cmip6_source, cmip6_experiment, cmip6_variable, overwrite + ) @click.command() # type: ignore[arg-type] @@ -119,6 +120,7 @@ def extract_cmip6( cmip6_experiment: str, cmip6_variable: str, queue: str, + overwrite: bool, # noqa: FBT001 ) -> None: sources = ( clio.VALID_CMIP6_SOURCES if cmip6_source == clio.RUN_ALL else [cmip6_source] @@ -144,6 +146,7 @@ def extract_cmip6( }, task_args={ "output-dir": output_dir, + "overwrite": overwrite, }, task_resources={ "queue": queue, From e5bd086c3a7c0a5287f7b8b1f6a52892e44bb7f6 Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 16 Jun 2024 00:20:03 -0700 Subject: [PATCH 50/71] Better cmip logging --- src/climate_downscale/extract/cmip6.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 3fe546f..085637e 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -52,7 +52,8 @@ def extract_cmip6_main( meta_subset = meta[mask].set_index("member_id").zstore.to_dict() print(f"Extracting {len(meta_subset)} members...") - for member, zstore_path in meta_subset.items(): + for i, (member, zstore_path) in enumerate(meta_subset.items()): + item = f"{i}/{len(meta_subset)} {member}" out_path = cd_data.extracted_cmip6_path( cmip6_variable, cmip6_experiment, @@ -60,11 +61,11 @@ def extract_cmip6_main( member, ) if out_path.exists() and not overwrite: - print("Skipping", member, zstore_path) + print("Skipping", item) continue try: - print("Extracting", member, zstore_path) + print("Extracting", item) cmip_data = load_cmip_data(zstore_path) shell_tools.touch(out_path, exist_ok=True) From 8300d19c0ec25f2b775698fad8c3dec0d72eb1b6 Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 16 Jun 2024 16:26:24 -0700 Subject: [PATCH 51/71] Delete some spurious historical variables, add runner for scenarios --- src/climate_downscale/generate/__init__.py | 6 + .../generate/historical_daily.py | 31 --- .../generate/scenario_daily.py | 204 ++++++++++++++++-- 3 files changed, 192 insertions(+), 49 deletions(-) diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index 022426b..10b0563 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -6,13 +6,19 @@ generate_historical_reference, generate_historical_reference_task, ) +from climate_downscale.generate.scenario_daily import ( + generate_scenario_daily, + 
generate_scenario_daily_task, +) RUNNERS = { "historical_daily": generate_historical_daily, "historical_reference": generate_historical_reference, + "scenario_daily": generate_scenario_daily, } TASK_RUNNERS = { "historical_daily": generate_historical_daily_task, "historical_reference": generate_historical_reference_task, + "scenario_daily": generate_scenario_daily_task, } diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 107f440..d1bcb2d 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -47,11 +47,6 @@ utils.daily_min, (273.15, 0.01), ), - "dewpoint_temperature": ( - ["2m_dewpoint_temperature"], - utils.daily_mean, - (273.15, 0.01), - ), "wind_speed": ( ["10m_u_component_of_wind", "10m_v_component_of_wind"], lambda x, y: utils.daily_mean(utils.vector_magnitude(x, y)), @@ -69,32 +64,6 @@ ), } -ADDITIONAL_TRANSFORM_MAP = { - "heat_index": ( - ["2m_temperature", "2m_dewpoint_temperature"], - lambda x, y: utils.daily_mean(utils.heat_index(x, y)), - (273.15, 0.01), - ), - "humidex": ( - ["2m_temperature", "2m_dewpoint_temperature"], - lambda x, y: utils.daily_mean(utils.humidex(x, y)), - (273.15, 0.01), - ), - "effective_temperature": ( - [ - "2m_temperature", - "2m_dewpoint_temperature", - "10m_u_component_of_wind", - "10m_v_component_of_wind", - ], - lambda t2m, t2d, uas, vas: utils.daily_mean( - utils.effective_temperature(t2m, t2d, uas, vas) - ), - (273.15, 0.01), - ), -} - - _P = typing.ParamSpec("_P") _T = typing.TypeVar("_T") diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 061ccfb..6451d88 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -1,17 +1,90 @@ +import itertools +import typing from pathlib import Path +import click import pandas as pd import xarray as xr +from rra_tools import jobmon -from climate_downscale.data import ClimateDownscaleData +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData from climate_downscale.generate import utils +VALID_YEARS = [str(y) for y in range(max(utils.REFERENCE_YEARS) + 1, 2101)] + # Map from source variable to a unit conversion function CONVERT_MAP = { + "uas": utils.scale_wind_speed_height, + "vas": utils.scale_wind_speed_height, + "hurs": utils.identity, "tas": utils.kelvin_to_celsius, + "tasmin": utils.kelvin_to_celsius, + "tasmax": utils.kelvin_to_celsius, "pr": utils.precipitation_flux_to_rainfall, } +# Map from target variable to: +# - a list of source variables +# - a transformation function +# - a tuple of offset and scale factors for the output for serialization +# - an anomaly type +TRANSFORM_MAP = { + "mean_temperature": ( + ["tas"], + utils.identity, + (273.15, 0.01), + "additive", + ), + "max_temperature": ( + ["tasmax"], + utils.identity, + (273.15, 0.01), + "additive", + ), + "min_temperature": ( + ["tasmin"], + utils.identity, + (273.15, 0.01), + "additive", + ), + "wind_speed": ( + ["uas", "vas"], + utils.vector_magnitude, + (0, 0.01), + "multiplicative", + ), + "relative_humidity": ( + ["hurs"], + utils.identity, + (0, 0.01), + "multiplicative", + ), + "total_precipitation": ( + ["pr"], + utils.identity, + (0, 0.1), + "multiplicative", + ), +} + + +_P = typing.ParamSpec("_P") +_T = typing.TypeVar("_T") + + +def with_target_variable( + *, + allow_all: bool = False, +) -> 
clio.ClickOption[_P, _T]: + return clio.with_choice( + "target-variable", + "t", + allow_all=allow_all, + choices=list(TRANSFORM_MAP.keys()), + help="Variable to generate.", + ) + def load_and_shift_longitude( ds_path: str | Path, @@ -33,7 +106,6 @@ def load_variable( ) -> xr.Dataset: if year == "reference": ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD) - ds = ds.groupby("date.month").mean("date") else: time_slice = slice(f"{year}-01-01", f"{year}-12-31") time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") @@ -65,7 +137,6 @@ def compute_anomaly( .assign_coords(longitude=(anomaly.longitude + 180) % 360 - 180) .sortby("longitude") ) - anomaly = utils.interpolate_to_target_latlon(anomaly) return anomaly @@ -73,21 +144,118 @@ def generate_scenario_daily_main( output_dir: str | Path, year: str | int, target_variable: str, - cmip_scenario: str, + cmip6_experiment: str, ) -> None: cd_data = ClimateDownscaleData(output_dir) - paths = cd_data.extracted_cmip6.glob(f"{target_variable}_{cmip_scenario}*.nc") - - for path in paths: - reference = load_variable(path, target_variable, "reference") - target = load_variable(path, target_variable, year) - - anomaly_type = "additive" # TRANSFORM_MAP[target_variable][1] - anomaly = compute_anomaly(reference, target, anomaly_type) - cd_data.save_daily_results( - anomaly, - scenario=cmip_scenario, - variable=target_variable, - year=year, - encoding_kwargs={"zlib": True, "complevel": 1}, + + (source_variables, transform_fun, (e_offset, e_scale), anomaly_type) = ( + TRANSFORM_MAP[target_variable] + ) + + paths_by_var = [ + list(cd_data.extracted_cmip6.glob(f"{source_variable}_{cmip6_experiment}*.nc")) + for source_variable in source_variables + ] + source_paths = list(zip(*paths_by_var, strict=True)) + + historical_reference = cd_data.load_daily_results( + scenario="historical", + variable=target_variable, + year="reference", + ) + + scale = 1 / len(source_paths) + anomaly = xr.zeros_like(historical_reference) + for sps in source_paths: + scenario_reference = transform_fun( # type: ignore[operator] + *[load_variable(sp, target_variable, "reference") for sp in sps] ) + target = transform_fun( # type: ignore[operator] + *[load_variable(sp, target_variable, year) for sp in sps] + ) + s_anomaly = scale * compute_anomaly(scenario_reference, target, anomaly_type) + anomaly += utils.interpolate_to_target_latlon(s_anomaly) + + scenario_data = historical_reference + anomaly + cd_data.save_daily_results( + scenario_data, + scenario=cmip6_experiment, + variable=target_variable, + year=year, + encoding_kwargs={ + "add_offset": e_offset, + "scale_factor": e_scale, + }, + ) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_year(years=VALID_YEARS) +@with_target_variable() +@clio.with_cmip6_experiment() +def generate_scenario_daily_task( + output_dir: str, year: str, target_variable: str, cmip6_experiment: str +) -> None: + generate_scenario_daily_main(output_dir, year, target_variable, cmip6_experiment) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_year(years=VALID_YEARS, allow_all=True) +@with_target_variable(allow_all=True) +@clio.with_cmip6_experiment(allow_all=True) +@clio.with_queue() +@clio.with_overwrite() +def generate_scenario_daily( + output_dir: str, + year: str, + target_variable: str, + cmip6_experiment: str, + queue: str, + overwrite: bool, # noqa: FBT001 +) -> None: + cd_data = ClimateDownscaleData(output_dir) + + years = 
VALID_YEARS if year == clio.RUN_ALL else [year] + variables = ( + list(TRANSFORM_MAP.keys()) + if target_variable == clio.RUN_ALL + else [target_variable] + ) + experiments = ( + list(clio.VALID_CMIP6_EXPERIMENTS) + if cmip6_experiment == clio.RUN_ALL + else [cmip6_experiment] + ) + + yve = [] + complete = [] + for y, v, e in itertools.product(years, variables, experiments): + path = cd_data.daily_results_path(y, v, e) + if not path.exists() or overwrite: + yve.append((y, v, e)) + else: + complete.append((y, v, e)) + + print(f"{len(complete)} tasks already done. " f"Launching {len(yve)} tasks") + + jobmon.run_parallel( + runner="cdtask", + task_name="generate scenario_daily", + flat_node_args=( + ("year", "target-variable", "cmip-experiment"), + yve, + ), + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 5, + "memory": "200G", + "runtime": "240m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + ) From 7a8edc9b8de552a2e8ae3bedc64d732dac911621 Mon Sep 17 00:00:00 2001 From: James Collins Date: Sun, 16 Jun 2024 16:28:19 -0700 Subject: [PATCH 52/71] Fix overwrite --- src/climate_downscale/extract/cmip6.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/climate_downscale/extract/cmip6.py b/src/climate_downscale/extract/cmip6.py index 3fe546f..590f3ff 100644 --- a/src/climate_downscale/extract/cmip6.py +++ b/src/climate_downscale/extract/cmip6.py @@ -136,6 +136,8 @@ def extract_cmip6( else [cmip6_variable] ) + overwrite_arg = {"overwrite": None} if overwrite else {} + jobmon.run_parallel( runner="cdtask", task_name="extract cmip6", @@ -146,7 +148,7 @@ def extract_cmip6( }, task_args={ "output-dir": output_dir, - "overwrite": overwrite, + **overwrite_arg, }, task_resources={ "queue": queue, From 164debe3dc45f0c2e2517fafef51bd23f0dc62ed Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 16 Jun 2024 16:34:48 -0700 Subject: [PATCH 53/71] Add logging, linear interp for anomaly, and multiplicative anomaly application --- .../generate/scenario_daily.py | 19 ++++++++++++++++--- src/climate_downscale/generate/utils.py | 3 ++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 6451d88..6ea39db 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -158,25 +158,38 @@ def generate_scenario_daily_main( ] source_paths = list(zip(*paths_by_var, strict=True)) + print("loading historical reference") historical_reference = cd_data.load_daily_results( scenario="historical", variable=target_variable, year="reference", ) + print("Making memory buffer") scale = 1 / len(source_paths) anomaly = xr.zeros_like(historical_reference) - for sps in source_paths: + for i, sps in enumerate(source_paths): + pid = f"{i}/{len(source_paths)}" + print(f"{pid}: Loading reference") scenario_reference = transform_fun( # type: ignore[operator] *[load_variable(sp, target_variable, "reference") for sp in sps] ) + print(f"{pid}: Loading target") target = transform_fun( # type: ignore[operator] *[load_variable(sp, target_variable, year) for sp in sps] ) + print(f"{pid}: computing anomaly") s_anomaly = scale * compute_anomaly(scenario_reference, target, anomaly_type) - anomaly += utils.interpolate_to_target_latlon(s_anomaly) + print(f"{pid}: downscaling anomaly") + anomaly += utils.interpolate_to_target_latlon(s_anomaly, method="linear") - scenario_data = historical_reference + 
anomaly + print("Computing scenario data") + if anomaly_type == "additive": + scenario_data = historical_reference + anomaly + else: + scenario_data = historical_reference * anomaly + + print("Saving") cd_data.save_daily_results( scenario_data, scenario=cmip6_experiment, diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index eb48c91..75fc0ad 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -288,9 +288,10 @@ def rename_val_column(ds: xr.Dataset) -> xr.Dataset: def interpolate_to_target_latlon( ds: xr.Dataset, + method: str = "nearest", ) -> xr.Dataset: return ( - ds.interp(longitude=TARGET_LON, latitude=TARGET_LAT, method="nearest") + ds.interp(longitude=TARGET_LON, latitude=TARGET_LAT, method=method) # type: ignore[arg-type] .interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") .interpolate_na(dim="latitude", method="nearest", fill_value="extrapolate") ) From 6770d88e046d39640f28c05ad1e1c49472dcdfa9 Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 16 Jun 2024 16:36:59 -0700 Subject: [PATCH 54/71] Reorder load and shift longitude ops --- src/climate_downscale/generate/scenario_daily.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 6ea39db..fc5f870 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -92,9 +92,9 @@ def load_and_shift_longitude( ) -> xr.Dataset: ds = xr.open_dataset(ds_path).sel(time=time_slice).compute() ds = ( - ds.rename({"lat": "latitude", "lon": "longitude", "time": "date"}) - .assign_coords(longitude=(ds.longitude + 180) % 360 - 180) - .sortby("longitude") + ds.assign_coords(lon=(ds.lon + 180) % 360 - 180) + .sortby("lon") + .rename({"lat": "latitude", "lon": "longitude", "time": "date"}) ) return ds From 892d1ed35a34e2df976eff146abeb0f65e7bfdd1 Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 16 Jun 2024 16:39:45 -0700 Subject: [PATCH 55/71] Infer variable from dataset --- src/climate_downscale/generate/scenario_daily.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index fc5f870..ac349a2 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -101,7 +101,6 @@ def load_and_shift_longitude( def load_variable( member_path: str | Path, - variable: str, year: str | int, ) -> xr.Dataset: if year == "reference": @@ -115,6 +114,7 @@ def load_variable( .interp(date=time_range) .interpolate_na(dim="date", method="nearest", fill_value="extrapolate") ) + variable = str(next(iter(ds))) conversion = CONVERT_MAP[variable] ds = conversion(utils.rename_val_column(ds)) return ds @@ -172,11 +172,11 @@ def generate_scenario_daily_main( pid = f"{i}/{len(source_paths)}" print(f"{pid}: Loading reference") scenario_reference = transform_fun( # type: ignore[operator] - *[load_variable(sp, target_variable, "reference") for sp in sps] + *[load_variable(sp, "reference") for sp in sps] ) print(f"{pid}: Loading target") target = transform_fun( # type: ignore[operator] - *[load_variable(sp, target_variable, year) for sp in sps] + *[load_variable(sp, year) for sp in sps] ) print(f"{pid}: computing anomaly") s_anomaly = scale * compute_anomaly(scenario_reference, target, anomaly_type) From 
5c97e1226178d4075657259e9500ccfb1a6e0cad Mon Sep 17 00:00:00 2001
From: collijk
Date: Sun, 16 Jun 2024 16:44:39 -0700
Subject: [PATCH 56/71] Need call to interp calendar

---
 src/climate_downscale/generate/scenario_daily.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py
index ac349a2..e11dd69 100644
--- a/src/climate_downscale/generate/scenario_daily.py
+++ b/src/climate_downscale/generate/scenario_daily.py
@@ -111,7 +111,7 @@ def load_variable(
         ds = load_and_shift_longitude(member_path, time_slice)
         ds = (
             ds.assign_coords(date=ds.date.dt.floor("D"))
-            .interp(date=time_range)
+            .interp_calendar(time_range, dim="date")
             .interpolate_na(dim="date", method="nearest", fill_value="extrapolate")
         )
     variable = str(next(iter(ds)))

From a5bef4fc18b429db5a08dcb9450e3c7f591a9fb6 Mon Sep 17 00:00:00 2001
From: James Collins
Date: Mon, 17 Jun 2024 15:30:36 -0700
Subject: [PATCH 57/71] Lots of fiddling to get things to work

---
 .../generate/scenario_daily.py                 | 111 +++++++++++-------
 src/climate_downscale/generate/utils.py        |   4 +-
 2 files changed, 74 insertions(+), 41 deletions(-)

diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py
index e11dd69..6fc2590 100644
--- a/src/climate_downscale/generate/scenario_daily.py
+++ b/src/climate_downscale/generate/scenario_daily.py
@@ -1,8 +1,10 @@
 import itertools
 import typing
 from pathlib import Path
+from collections import defaultdict
 
 import click
+import numpy as np
 import pandas as pd
 import xarray as xr
 from rra_tools import jobmon
@@ -91,10 +93,13 @@ def load_and_shift_longitude(
     time_slice: slice,
 ) -> xr.Dataset:
     ds = xr.open_dataset(ds_path).sel(time=time_slice).compute()
+    if ds.time.size == 0:
+        msg = 'No data in slice'
+        raise KeyError(msg)
     ds = (
         ds.assign_coords(lon=(ds.lon + 180) % 360 - 180)
         .sortby("lon")
-        .rename({"lat": "latitude", "lon": "longitude", "time": "date"})
+        .rename({"lat": "latitude", "lon": "longitude"})
     )
     return ds
 
@@ -104,15 +109,16 @@ def load_variable(
     year: str | int,
 ) -> xr.Dataset:
     if year == "reference":
-        ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD)
+        ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD).rename({"time": "date"})
     else:
         time_slice = slice(f"{year}-01-01", f"{year}-12-31")
         time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31")
         ds = load_and_shift_longitude(member_path, time_slice)
         ds = (
-            ds.assign_coords(date=ds.date.dt.floor("D"))
-            .interp_calendar(time_range, dim="date")
-            .interpolate_na(dim="date", method="nearest", fill_value="extrapolate")
+            ds.assign_coords(time=ds.time.dt.floor("D"))
+            .interp_calendar(time_range)
+            .interpolate_na(dim="time", method="nearest", fill_value="extrapolate")
+            .rename({"time": "date"})
         )
     variable = str(next(iter(ds)))
     conversion = CONVERT_MAP[variable]
@@ -123,20 +129,15 @@ def load_variable(
 def compute_anomaly(
     reference: xr.Dataset, target: xr.Dataset, anomaly_type: str
 ) -> xr.Dataset:
+    reference = reference.groupby("date.month").mean("date")
     if anomaly_type == "additive":
-        anomaly = target.groupby("time.month") - reference
+        anomaly = target.groupby("date.month") - reference
     elif anomaly_type == "multiplicative":
-        anomaly = (target.groupby("time.month") + 1) / (reference + 1)  # type: ignore[operator]
+        anomaly = (target + 1).groupby("date.month") / (reference + 1)
     else:
         msg = f"Unknown anomaly type: {anomaly_type}"
         raise 
ValueError(msg) - - anomaly = ( - anomaly.drop_vars("month") - .rename({"lat": "latitude", "lon": "longitude", "time": "date"}) - .assign_coords(longitude=(anomaly.longitude + 180) % 360 - 180) - .sortby("longitude") - ) + anomaly = anomaly.drop_vars("month") return anomaly @@ -152,11 +153,24 @@ def generate_scenario_daily_main( TRANSFORM_MAP[target_variable] ) - paths_by_var = [ - list(cd_data.extracted_cmip6.glob(f"{source_variable}_{cmip6_experiment}*.nc")) - for source_variable in source_variables + models_by_var = {} + for source_variable in source_variables: + model_vars = set([ + p.stem.split(f"{cmip6_experiment}_")[1] + for p in cd_data.extracted_cmip6.glob(f"{source_variable}_{cmip6_experiment}*.nc") + ]) + models_by_var[source_variable] = model_vars + + shared_models = set.intersection(*models_by_var.values()) + for var, models in models_by_var.items(): + extra_models = models.difference(shared_models) + if extra_models: + print(var, extra_models) + source_paths = [ + [cd_data.extracted_cmip6 / f'{source_variable}_{cmip6_experiment}_{model}.nc' + for source_variable in source_variables] + for model in sorted(shared_models) ] - source_paths = list(zip(*paths_by_var, strict=True)) print("loading historical reference") historical_reference = cd_data.load_daily_results( @@ -165,30 +179,48 @@ def generate_scenario_daily_main( year="reference", ) - print("Making memory buffer") - scale = 1 / len(source_paths) - anomaly = xr.zeros_like(historical_reference) + anomalies = {} + source_paths = source_paths for i, sps in enumerate(source_paths): - pid = f"{i}/{len(source_paths)}" + pid = f"{i+1}/{len(source_paths)} {sps[0].stem}" print(f"{pid}: Loading reference") - scenario_reference = transform_fun( # type: ignore[operator] - *[load_variable(sp, "reference") for sp in sps] - ) - print(f"{pid}: Loading target") - target = transform_fun( # type: ignore[operator] - *[load_variable(sp, year) for sp in sps] - ) + try: + scenario_reference = transform_fun( # type: ignore[operator] + *[load_variable(sp, "reference") for sp in sps] + ) + print(f"{pid}: Loading target") + target = transform_fun( # type: ignore[operator] + *[load_variable(sp, year) for sp in sps] + ) + except KeyError: + print(f"{pid}: Bad formatting, skipping...") + continue print(f"{pid}: computing anomaly") - s_anomaly = scale * compute_anomaly(scenario_reference, target, anomaly_type) - print(f"{pid}: downscaling anomaly") - anomaly += utils.interpolate_to_target_latlon(s_anomaly, method="linear") + s_anomaly = compute_anomaly(scenario_reference, target, anomaly_type) + key = f"{len(s_anomaly.latitude)}_{len(s_anomaly.longitude)}" + old = anomalies.get(key, 0) + if old: + for coord in ['latitude', 'longitude']: + old_c = old[coord].to_numpy() + new_c = s_anomaly[coord].to_numpy() + if np.abs(old_c - new_c).max() < 1e-5: + s_anomaly = s_anomaly.assign(**{coord: old_c}) + else: + msg = f"{coord} does not match despite having the same subdivision" + raise ValueError(msg) + anomalies[key] = old + s_anomaly + anomaly = 0 + for i, (k, v) in enumerate(anomalies.items()): + print(f"Downscaling {i+1}/{len(anomalies)}: {k}") + anomaly += utils.interpolate_to_target_latlon(v, method="linear") + anomaly /= len(source_paths) print("Computing scenario data") if anomaly_type == "additive": - scenario_data = historical_reference + anomaly + scenario_data = historical_reference + anomaly.groupby('date.month') else: - scenario_data = historical_reference * anomaly - + scenario_data = historical_reference * anomaly.groupby('date.month') + 
scenario_data = scenario_data.drop_vars('month') print("Saving") cd_data.save_daily_results( scenario_data, @@ -245,19 +277,18 @@ def generate_scenario_daily( yve = [] complete = [] for y, v, e in itertools.product(years, variables, experiments): - path = cd_data.daily_results_path(y, v, e) + path = cd_data.daily_results_path(scenario=e, variable=v, year=y) if not path.exists() or overwrite: yve.append((y, v, e)) else: complete.append((y, v, e)) print(f"{len(complete)} tasks already done. " f"Launching {len(yve)} tasks") - jobmon.run_parallel( runner="cdtask", task_name="generate scenario_daily", flat_node_args=( - ("year", "target-variable", "cmip-experiment"), + ("year", "target-variable", "cmip6-experiment"), yve, ), task_args={ @@ -266,8 +297,8 @@ def generate_scenario_daily( task_resources={ "queue": queue, "cores": 5, - "memory": "200G", - "runtime": "240m", + "memory": "120G", + "runtime": "400m", "project": "proj_rapidresponse", }, max_attempts=1, diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index 75fc0ad..8ed5cb0 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -7,7 +7,7 @@ np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" ) TARGET_LAT = xr.DataArray( - np.round(np.arange(90.0, -90.1, -0.1, dtype="float32"), 1), dims="latitude" + np.round(np.arange(-90.0, 90.1, 0.1, dtype="float32"), 1), dims="latitude" ) ############################# @@ -293,5 +293,7 @@ def interpolate_to_target_latlon( return ( ds.interp(longitude=TARGET_LON, latitude=TARGET_LAT, method=method) # type: ignore[arg-type] .interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") + .sortby('latitude') .interpolate_na(dim="latitude", method="nearest", fill_value="extrapolate") + .sortby('latitude', ascending=False) ) From 71a81b925c8021782014a4894810c5d2c88c720b Mon Sep 17 00:00:00 2001 From: collijk Date: Mon, 17 Jun 2024 17:09:22 -0700 Subject: [PATCH 58/71] Add annual scenario --- src/climate_downscale/data.py | 31 ++- src/climate_downscale/generate/__init__.py | 6 + .../generate/scenario_annual.py | 255 ++++++++++++++++++ .../generate/scenario_daily.py | 58 ++-- src/climate_downscale/generate/utils.py | 68 +++-- 5 files changed, 372 insertions(+), 46 deletions(-) create mode 100644 src/climate_downscale/generate/scenario_annual.py diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index f27355d..cffdf82 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -52,7 +52,9 @@ def load_cmip6_metadata(self) -> pd.DataFrame: meta.to_parquet(meta_path) return pd.read_parquet(meta_path) - def extracted_cmip6_path(self, variable: str, experiment: str, source: str, member: str) -> Path: + def extracted_cmip6_path( + self, variable: str, experiment: str, source: str, member: str + ) -> Path: return self.extracted_cmip6 / f"{variable}_{experiment}_{source}_{member}.nc" @property @@ -157,6 +159,33 @@ def load_daily_results( results_path = self.daily_results_path(scenario, variable, year) return xr.open_dataset(results_path) + @property + def annual_results(self) -> Path: + return self.results / "annual" + + def annual_results_path(self, scenario: str, variable: str) -> Path: + return self.annual_results / scenario / f"{variable}.nc" + + def save_annual_results( + self, + results_ds: xr.Dataset, + scenario: str, + variable: str, + encoding_kwargs: dict[str, Any], + ) -> None: + path = self.annual_results_path(scenario, variable) + 
mkdir(path.parent, exist_ok=True, parents=True) + touch(path, exist_ok=True) + + encoding = { + "dtype": "int16", + "_FillValue": -32767, + "zlib": True, + "complevel": 1, + } + encoding.update(encoding_kwargs) + results_ds.to_netcdf(path, encoding={"value": encoding}) + def save_raster( raster: rt.RasterArray, diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index 10b0563..4f4afa8 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -6,6 +6,10 @@ generate_historical_reference, generate_historical_reference_task, ) +from climate_downscale.generate.scenario_annual import ( + generate_scenario_annual, + generate_scenario_annual_task, +) from climate_downscale.generate.scenario_daily import ( generate_scenario_daily, generate_scenario_daily_task, @@ -15,10 +19,12 @@ "historical_daily": generate_historical_daily, "historical_reference": generate_historical_reference, "scenario_daily": generate_scenario_daily, + "scenario_annual": generate_scenario_annual, } TASK_RUNNERS = { "historical_daily": generate_historical_daily_task, "historical_reference": generate_historical_reference_task, "scenario_daily": generate_scenario_daily_task, + "scenario_annual": generate_scenario_annual_task, } diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py new file mode 100644 index 0000000..55aa233 --- /dev/null +++ b/src/climate_downscale/generate/scenario_annual.py @@ -0,0 +1,255 @@ +import itertools +import typing +from pathlib import Path + +import click +import xarray as xr +from rra_tools import jobmon + +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData +from climate_downscale.generate import utils +from climate_downscale.generate.scenario_daily import VALID_YEARS + +YEARS = { + "historical": clio.VALID_YEARS, + "scenario": VALID_YEARS, +} +TEMP_THRESHOLDS = list(range(20, 35)) + + +class Transform: + def __init__( + self, + source_variables: list[str], + transform_funcs: list[typing.Callable[..., xr.Dataset]] = [utils.annual_mean], # noqa: B006 + encoding_scale: float = 1.0, + encoding_offset: float = 0.0, + ): + self.source_variables = source_variables + self.transform_funcs = transform_funcs + self.encoding_scale = encoding_scale + self.encoding_offset = encoding_offset + + def __iter__(self) -> typing.Iterator[str]: + return iter(self.source_variables) + + def __call__(self, *datasets: xr.Dataset) -> xr.Dataset: + res = self.transform_funcs[0](*datasets) + for transform_func in self.transform_funcs[1:]: + res = transform_func(res) + return res + + @property + def encoding_kwargs(self) -> dict[str, float]: + return {"add_offset": self.encoding_offset, "scale_factor": self.encoding_scale} + + +TRANSFORM_MAP = { + "mean_temperature": Transform( + source_variables=["mean_temperature"], + encoding_scale=0.01, + encoding_offset=273.15, + ), + "mean_high_temperature": Transform( + source_variables=["max_temperature"], + encoding_scale=0.01, + encoding_offset=273.15, + ), + "mean_low_temperature": Transform( + source_variables=["min_temperature"], + encoding_scale=0.01, + encoding_offset=273.15, + ), + **{ + f"days_over_{temp}C": Transform( + source_variables=["mean_temperature"], + transform_funcs=[utils.count_threshold(temp), utils.annual_sum], + ) + for temp in TEMP_THRESHOLDS + }, + "mean_heat_index": Transform( + source_variables=["mean_temperature", 
"relative_humidity"], + transform_funcs=[utils.heat_index, utils.annual_mean], + encoding_scale=0.01, + encoding_offset=273.15, + ), + **{ + f"days_over_{temp}C_heat_index": Transform( + source_variables=["mean_temperature", "relative_humidity"], + transform_funcs=[ + utils.heat_index, + utils.count_threshold(temp), + utils.annual_sum, + ], + ) + for temp in TEMP_THRESHOLDS + }, + "mean_humidex": Transform( + source_variables=["mean_temperature", "relative_humidity"], + transform_funcs=[utils.humidex, utils.annual_mean], + encoding_scale=0.01, + encoding_offset=273.15, + ), + **{ + f"days_over_{temp}C_humidex": Transform( + source_variables=["mean_temperature", "relative_humidity"], + transform_funcs=[ + utils.humidex, + utils.count_threshold(temp), + utils.annual_sum, + ], + ) + for temp in TEMP_THRESHOLDS + }, + "mean_effective_temperature": Transform( + source_variables=["mean_temperature", "relative_humidity", "wind_speed"], + transform_funcs=[utils.effective_temperature, utils.annual_mean], + encoding_scale=0.01, + encoding_offset=273.15, + ), + **{ + f"days_over_{temp}C_effective_temperature": Transform( + source_variables=["mean_temperature", "relative_humidity", "wind_speed"], + transform_funcs=[ + utils.effective_temperature, + utils.count_threshold(temp), + utils.annual_sum, + ], + ) + for temp in TEMP_THRESHOLDS + }, + "wind_speed": Transform( + source_variables=["wind_speed"], + encoding_scale=0.01, + ), + "relative_humidity": Transform( + source_variables=["relative_humidity"], + encoding_scale=0.01, + ), + "total_precipitation": Transform( + source_variables=["total_precipitation"], + transform_funcs=[utils.annual_sum], + encoding_scale=0.1, + ), +} + + +_P = typing.ParamSpec("_P") +_T = typing.TypeVar("_T") + + +def with_target_variable( + *, + allow_all: bool = False, +) -> clio.ClickOption[_P, _T]: + return clio.with_choice( + "target-variable", + "t", + allow_all=allow_all, + choices=list(TRANSFORM_MAP.keys()), + help="Variable to generate.", + ) + + +def generate_scenario_annual_main( + output_dir: str | Path, + target_variable: str, + scenario: str, +) -> None: + cd_data = ClimateDownscaleData(output_dir) + + transform = TRANSFORM_MAP[target_variable] + + annual_data = [] + for scenario_label, year_list in YEARS.items(): + scenario_label = scenario if scenario_label == "scenario" else "historical" # noqa: PLW2901 + for year in year_list: + print(f"Loading {scenario_label} {year} data for {target_variable}") + ds = transform( + *[ + cd_data.load_daily_results(scenario_label, source_variable, year) + for source_variable in transform + ] + ) + annual_data.append(ds) + + annual_ds = xr.concat(annual_data, dim="year") + cd_data.save_annual_results( + annual_ds, + scenario=scenario, + variable=target_variable, + encoding_kwargs=transform.encoding_kwargs, + ) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@with_target_variable() +@clio.with_cmip6_experiment() +def generate_scenario_annual_task( + output_dir: str, + target_variable: str, + cmip6_experiment: str, +) -> None: + generate_scenario_annual_main(output_dir, target_variable, cmip6_experiment) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@with_target_variable(allow_all=True) +@clio.with_cmip6_experiment(allow_all=True) +@clio.with_queue() +@clio.with_overwrite() +def generate_scenario_annual( + output_dir: str, + target_variable: str, + cmip6_experiment: str, + queue: str, + overwrite: bool, # noqa: FBT001 +) -> None: + cd_data = 
ClimateDownscaleData(output_dir) + + variables = ( + list(TRANSFORM_MAP.keys()) + if target_variable == clio.RUN_ALL + else [target_variable] + ) + experiments = ( + list(clio.VALID_CMIP6_EXPERIMENTS) + if cmip6_experiment == clio.RUN_ALL + else [cmip6_experiment] + ) + + ve = [] + complete = [] + for v, e in itertools.product(variables, experiments): + path = cd_data.annual_results_path(scenario=e, variable=v) + if not path.exists() or overwrite: + ve.append((v, e)) + else: + complete.append((v, e)) + + print(f"{len(complete)} tasks already done. {len(ve)} tasks to do.") + if not ve: + return + + jobmon.run_parallel( + runner="cdtask", + task_name="generate scenario_daily", + flat_node_args=( + ("target-variable", "cmip6-experiment"), + ve, + ), + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 5, + "memory": "120G", + "runtime": "400m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + ) diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 6fc2590..6ec2b2a 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -1,7 +1,6 @@ import itertools import typing from pathlib import Path -from collections import defaultdict import click import numpy as np @@ -94,7 +93,7 @@ def load_and_shift_longitude( ) -> xr.Dataset: ds = xr.open_dataset(ds_path).sel(time=time_slice).compute() if ds.time.size == 0: - msg = 'No data in slice' + msg = "No data in slice" raise KeyError(msg) ds = ( ds.assign_coords(lon=(ds.lon + 180) % 360 - 180) @@ -109,7 +108,9 @@ def load_variable( year: str | int, ) -> xr.Dataset: if year == "reference": - ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD).rename({"time": "date"}) + ds = load_and_shift_longitude(member_path, utils.REFERENCE_PERIOD).rename( + {"time": "date"} + ) else: time_slice = slice(f"{year}-01-01", f"{year}-12-31") time_range = pd.date_range(f"{year}-01-01", f"{year}-12-31") @@ -133,7 +134,7 @@ def compute_anomaly( if anomaly_type == "additive": anomaly = target.groupby("date.month") - reference elif anomaly_type == "multiplicative": - anomaly = (target + 1).groupby("date.month") / (reference + 1) # type: ignore[operator] + anomaly = (target + 1).groupby("date.month") / (reference + 1) else: msg = f"Unknown anomaly type: {anomaly_type}" raise ValueError(msg) @@ -141,7 +142,7 @@ def compute_anomaly( return anomaly -def generate_scenario_daily_main( +def generate_scenario_daily_main( # noqa: C901, PLR0912, PLR0915 output_dir: str | Path, year: str | int, target_variable: str, @@ -155,10 +156,12 @@ def generate_scenario_daily_main( models_by_var = {} for source_variable in source_variables: - model_vars = set([ + model_vars = { p.stem.split(f"{cmip6_experiment}_")[1] - for p in cd_data.extracted_cmip6.glob(f"{source_variable}_{cmip6_experiment}*.nc") - ]) + for p in cd_data.extracted_cmip6.glob( + f"{source_variable}_{cmip6_experiment}*.nc" + ) + } models_by_var[source_variable] = model_vars shared_models = set.intersection(*models_by_var.values()) @@ -167,8 +170,10 @@ def generate_scenario_daily_main( if extra_models: print(var, extra_models) source_paths = [ - [cd_data.extracted_cmip6 / f'{source_variable}_{cmip6_experiment}_{model}.nc' - for source_variable in source_variables] + [ + cd_data.extracted_cmip6 / f"{source_variable}_{cmip6_experiment}_{model}.nc" + for source_variable in source_variables + ] for model in sorted(shared_models) ] @@ -179,8 +184,7 
@@ def generate_scenario_daily_main( year="reference", ) - anomalies = {} - source_paths = source_paths + anomalies: dict[str, xr.Dataset] = {} for i, sps in enumerate(source_paths): pid = f"{i+1}/{len(source_paths)} {sps[0].stem}" print(f"{pid}: Loading reference") @@ -198,29 +202,37 @@ def generate_scenario_daily_main( print(f"{pid}: computing anomaly") s_anomaly = compute_anomaly(scenario_reference, target, anomaly_type) key = f"{len(s_anomaly.latitude)}_{len(s_anomaly.longitude)}" - old = anomalies.get(key, 0) - if old: - for coord in ['latitude', 'longitude']: + + if key in anomalies: + old = anomalies[key] + for coord in ["latitude", "longitude"]: old_c = old[coord].to_numpy() new_c = s_anomaly[coord].to_numpy() - if np.abs(old_c - new_c).max() < 1e-5: - s_anomaly = s_anomaly.assign(**{coord: old_c}) + tol = 1e-5 + if np.abs(old_c - new_c).max() < tol: + s_anomaly = s_anomaly.assign({coord: old_c}) else: msg = f"{coord} does not match despite having the same subdivision" raise ValueError(msg) - anomalies[key] = old + s_anomaly - anomaly = 0 + anomalies[key] = old + s_anomaly + else: + anomalies[key] = s_anomaly + + anomaly = xr.Dataset() for i, (k, v) in enumerate(anomalies.items()): print(f"Downscaling {i+1}/{len(anomalies)}: {k}") - anomaly += utils.interpolate_to_target_latlon(v, method="linear") + if anomaly.nbytes: + anomaly += utils.interpolate_to_target_latlon(v, method="linear") + else: + anomaly = utils.interpolate_to_target_latlon(v, method="linear") anomaly /= len(source_paths) print("Computing scenario data") if anomaly_type == "additive": - scenario_data = historical_reference + anomaly.groupby('date.month') + scenario_data = historical_reference + anomaly.groupby("date.month") else: - scenario_data = historical_reference * anomaly.groupby('date.month') - scenario_data = scenario_data.drop_vars('month') + scenario_data = historical_reference * anomaly.groupby("date.month") + scenario_data = scenario_data.drop_vars("month") print("Saving") cd_data.save_daily_results( scenario_data, diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index 8ed5cb0..e949f27 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -1,3 +1,5 @@ +from collections.abc import Callable + import numpy as np import xarray as xr @@ -99,18 +101,41 @@ def daily_mean(ds: xr.Dataset) -> xr.Dataset: return ds.groupby("time.date").mean() +def annual_mean(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("date.year").mean() + + def daily_max(ds: xr.Dataset) -> xr.Dataset: return ds.groupby("time.date").max() +def annual_max(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("date.year").max() + + def daily_min(ds: xr.Dataset) -> xr.Dataset: return ds.groupby("time.date").min() +def annual_min(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("date.year").min() + + def daily_sum(ds: xr.Dataset) -> xr.Dataset: return ds.groupby("time.date").sum() +def annual_sum(ds: xr.Dataset) -> xr.Dataset: + return ds.groupby("date.year").sum() + + +def count_threshold(threshold: int | float) -> Callable[[xr.Dataset], xr.Dataset]: + def count(ds: xr.Dataset) -> xr.Dataset: + return ds > threshold + + return count + + ######################## # Data transformations # ######################## @@ -173,7 +198,7 @@ def rh_percent( def heat_index( temperature_c: xr.Dataset, - dewpoint_temperature_c: xr.Dataset, + relative_humidity_percent: xr.Dataset, ) -> xr.Dataset: """Calculate the heat index. 
@@ -183,16 +208,17 @@ def heat_index( ---------- temperature_c Temperature in Celsius - dewpoint_temperature_c - Dewpoint temperature in Celsius + relative_humidity_percent + Relative humidity as a percentage Returns ------- xr.Dataset Heat index in Celsius """ - t = temperature_c # Alias for simplicity in the formula - r = rh_percent(temperature_c, dewpoint_temperature_c) + # Alias for simplicity in the formula + t = temperature_c + r = relative_humidity_percent # Heat index formula from canonical multi-variable regression hi_raw = ( @@ -214,7 +240,7 @@ def heat_index( def humidex( temperature_c: xr.Dataset, - dewpoint_temperature_c: xr.Dataset, + relative_humidity_percent: xr.Dataset, ) -> xr.Dataset: """Calculate the humidex. @@ -224,23 +250,23 @@ def humidex( ---------- temperature_c Temperature in Celsius - dewpoint_temperature_c - Dewpoint temperature in Celsius + relative_humidity_percent + Relative humidity as a percentage Returns ------- xr.Dataset Humidex in Celsius """ - vp = buck_vapor_pressure(dewpoint_temperature_c) + svp = buck_vapor_pressure(temperature_c) + vp = relative_humidity_percent / 100 * svp return temperature_c + 0.5555 * (vp - 10) def effective_temperature( temperature_c: xr.Dataset, - dewpoint_temperature_c: xr.Dataset, - uas: xr.Dataset, - vas: xr.Dataset, + relative_humidity_percent: xr.Dataset, + wind_speed_m_s: xr.Dataset, ) -> xr.Dataset: """Calculate the effective temperature. @@ -250,12 +276,10 @@ def effective_temperature( ---------- temperature_c Temperature in Celsius - dewpoint_temperature_c - Dewpoint temperature in Celsius - uas - U-component of wind speed - vas - V-component of wind speed + relative_humidity_percent + Relative humidity as a percentage + wind_speed_m_s + Wind speed in m/s Returns ------- @@ -264,8 +288,8 @@ def effective_temperature( """ # Alias for simplicity in the formula t = temperature_c - r = rh_percent(temperature_c, dewpoint_temperature_c) - v = vector_magnitude(uas, vas) + r = relative_humidity_percent + v = wind_speed_m_s wind_adjustment = 1 / (1.76 + 1.4 * v**0.75) et = ( @@ -293,7 +317,7 @@ def interpolate_to_target_latlon( return ( ds.interp(longitude=TARGET_LON, latitude=TARGET_LAT, method=method) # type: ignore[arg-type] .interpolate_na(dim="longitude", method="nearest", fill_value="extrapolate") - .sortby('latitude') + .sortby("latitude") .interpolate_na(dim="latitude", method="nearest", fill_value="extrapolate") - .sortby('latitude', ascending=False) + .sortby("latitude", ascending=False) ) From ff75c62b6ee693215e16e4aebdf930d87c41e7ae Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 19 Jun 2024 09:04:45 -0700 Subject: [PATCH 59/71] Catch empty workflow error --- src/climate_downscale/extract/era5.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 95f49f2..3322898 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -246,6 +246,9 @@ def extract_era5( # noqa: PLR0913 to_download.append(spec) to_compress.append(spec) + if not to_download: + print('No datasets to download') + while to_download: downloads_left = len(to_download) @@ -285,6 +288,10 @@ def extract_era5( # noqa: PLR0913 max_attempts=1, ) + if not to_compress: + print('No datasets to compress.') + return + jobmon.run_parallel( runner="cdtask", task_name="extract era5_compress", From 107e5c66889aa62094246da6ca662317482a5d99 Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 19 Jun 2024 09:05:09 -0700 Subject: 
[PATCH 60/71] Get annual working --- .../generate/scenario_annual.py | 44 +++++++++++-------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index 55aa233..bb32fbb 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -42,7 +42,9 @@ def __call__(self, *datasets: xr.Dataset) -> xr.Dataset: @property def encoding_kwargs(self) -> dict[str, float]: - return {"add_offset": self.encoding_offset, "scale_factor": self.encoding_scale} + if self.encoding_offset != 0. or self.encoding_scale != 1: + return {"add_offset": self.encoding_offset, "scale_factor": self.encoding_scale} + return {} TRANSFORM_MAP = { @@ -161,22 +163,26 @@ def generate_scenario_annual_main( transform = TRANSFORM_MAP[target_variable] - annual_data = [] - for scenario_label, year_list in YEARS.items(): - scenario_label = scenario if scenario_label == "scenario" else "historical" # noqa: PLW2901 - for year in year_list: - print(f"Loading {scenario_label} {year} data for {target_variable}") - ds = transform( - *[ - cd_data.load_daily_results(scenario_label, source_variable, year) - for source_variable in transform - ] + + variables = [] + for source_variable in transform: + paths = [] + for scenario_label, year_list in YEARS.items(): + s = "historical" if scenario_label == "historical" else scenario + for year in year_list: + paths.append(cd_data.daily_results_path(s, source_variable, year)) + variables.append( + xr.open_mfdataset( + paths, + parallel=True, + chunks={'date': -1, 'latitude': 601, 'longitude': 1200}, ) - annual_data.append(ds) - - annual_ds = xr.concat(annual_data, dim="year") + ) + ds = transform(*variables).compute() + + cd_data.save_annual_results( - annual_ds, + ds, scenario=scenario, variable=target_variable, encoding_kwargs=transform.encoding_kwargs, @@ -236,7 +242,7 @@ def generate_scenario_annual( jobmon.run_parallel( runner="cdtask", - task_name="generate scenario_daily", + task_name="generate scenario_annual", flat_node_args=( ("target-variable", "cmip6-experiment"), ve, @@ -246,9 +252,9 @@ def generate_scenario_annual( }, task_resources={ "queue": queue, - "cores": 5, - "memory": "120G", - "runtime": "400m", + "cores": 20, + "memory": "250G", + "runtime": "600m", "project": "proj_rapidresponse", }, max_attempts=1, From b643ca352c08a0e40f3589d5a0078bd8569c8056 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 19 Jun 2024 09:27:59 -0700 Subject: [PATCH 61/71] make year usage more coherent --- src/climate_downscale/cli_options.py | 16 +++- .../downscale/prepare_training_data.py | 4 +- src/climate_downscale/extract/era5.py | 92 +++++++++++-------- .../generate/historical_daily.py | 6 +- .../generate/historical_reference.py | 3 +- .../generate/scenario_annual.py | 42 ++++----- .../generate/scenario_daily.py | 8 +- src/climate_downscale/generate/utils.py | 8 +- 8 files changed, 102 insertions(+), 77 deletions(-) diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index 38117ce..1612f6e 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -18,12 +18,14 @@ _P = ParamSpec("_P") -VALID_YEARS = [str(y) for y in range(1990, 2024)] +VALID_HISTORY_YEARS = [str(y) for y in range(1990, 2024)] +VALID_REFERENCE_YEARS = VALID_HISTORY_YEARS[-5:] +VALID_FORECAST_YEARS = [str(y) for y in range(2024, 2101)] def with_year( *, - years: list[str] = VALID_YEARS, + years: 
list[str], allow_all: bool = False, ) -> ClickOption[_P, _T]: return with_choice( @@ -132,12 +134,16 @@ def with_cmip6_source( def with_cmip6_experiment( *, allow_all: bool = False, + allow_historical: bool = False, ) -> ClickOption[_P, _T]: + choices = VALID_CMIP6_EXPERIMENTS[:] + if allow_historical: + choices.append("historical") return with_choice( "cmip6-experiment", "e", allow_all=allow_all, - choices=VALID_CMIP6_EXPERIMENTS, + choices=choices, help="CMIP6 experiment to extract.", ) @@ -204,7 +210,9 @@ def with_overwrite() -> ClickOption[_P, _T]: __all__ = [ - "VALID_YEARS", + "VALID_HISTORY_YEARS", + "VALID_REFERENCE_YEARS", + "VALID_FORECAST_YEARS", "VALID_MONTHS", "VALID_ERA5_VARIABLES", "VALID_ERA5_DATASETS", diff --git a/src/climate_downscale/downscale/prepare_training_data.py b/src/climate_downscale/downscale/prepare_training_data.py index 570bb4e..fbc7d7d 100644 --- a/src/climate_downscale/downscale/prepare_training_data.py +++ b/src/climate_downscale/downscale/prepare_training_data.py @@ -102,7 +102,7 @@ def prepare_training_data_main(output_dir: str | Path, year: str) -> None: @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year() +@clio.with_year(years=clio.VALID_HISTORY_YEARS) def prepare_training_data_task(output_dir: str, year: str) -> None: prepare_training_data_main(output_dir, year) @@ -115,7 +115,7 @@ def prepare_training_data(output_dir: str, queue: str) -> None: runner="cdtask", task_name="downscale prepare_training_data", node_args={ - "year": clio.VALID_YEARS, + "year": clio.VALID_HISTORY_YEARS, }, task_args={ "output-dir": output_dir, diff --git a/src/climate_downscale/extract/era5.py b/src/climate_downscale/extract/era5.py index 3322898..8b33e43 100644 --- a/src/climate_downscale/extract/era5.py +++ b/src/climate_downscale/extract/era5.py @@ -75,7 +75,7 @@ def download_era5_main( print(f"Failed to download {era5_dataset} {era5_variable} {year} {month}") if download_path.exists(): download_path.unlink() - raise e # noqa: TRY201 + raise e def unzip_and_compress_era5( @@ -138,7 +138,7 @@ def unzip_and_compress_era5( @clio.with_output_directory(DEFAULT_ROOT) @clio.with_era5_dataset() @clio.with_era5_variable() -@clio.with_year() +@clio.with_year(years=clio.VALID_HISTORY_YEARS) @clio.with_month() @click.option( "--user", @@ -166,7 +166,7 @@ def download_era5_task( @clio.with_output_directory(DEFAULT_ROOT) @clio.with_era5_dataset() @clio.with_era5_variable() -@clio.with_year() +@clio.with_year(years=clio.VALID_HISTORY_YEARS) @clio.with_month() def unzip_and_compress_era5_task( output_dir: str, @@ -184,40 +184,14 @@ def unzip_and_compress_era5_task( ) -@click.command() # type: ignore[arg-type] -@clio.with_output_directory(DEFAULT_ROOT) -@clio.with_era5_dataset(allow_all=True) -@clio.with_era5_variable(allow_all=True) -@clio.with_year(allow_all=True) -@clio.with_month(allow_all=True) -@clio.with_queue() -def extract_era5( # noqa: PLR0913 - output_dir: str, - era5_dataset: str, - era5_variable: str, - year: str, - month: str, - queue: str, -) -> None: - cddata = ClimateDownscaleData(output_dir) - cred_path = cddata.credentials_root / "copernicus.yaml" - credentials = yaml.safe_load(cred_path.read_text()) - users = list(credentials["keys"]) - jobs_per_user = 20 - - datasets = ( - clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] - ) - variables = ( - clio.VALID_ERA5_VARIABLES if era5_variable == clio.RUN_ALL else [era5_variable] - ) - years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] - 
months = clio.VALID_MONTHS if month == clio.RUN_ALL else [month] - +def build_task_lists( + cddata: ClimateDownscaleData, + *spec_variables: list[str], +) -> tuple[list[tuple[str, ...]], ...]: to_download = [] to_compress = [] complete = [] - for spec in itertools.product(datasets, variables, years, months): + for spec in itertools.product(*spec_variables): final_out_path = cddata.extracted_era5_path(*spec) download_path, _ = get_download_spec(final_out_path) @@ -239,15 +213,57 @@ def extract_era5( # noqa: PLR0913 elif download_path.exists(): to_compress.append(spec) elif final_out_path.exists(): - # We've already extracted this dataset (deleting the download path is the last step) + # We've already extracted this dataset + # (deleting the download path is the last step) complete.append(spec) continue else: to_download.append(spec) to_compress.append(spec) + return to_download, to_compress, complete + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_era5_dataset(allow_all=True) +@clio.with_era5_variable(allow_all=True) +@clio.with_year(years=clio.VALID_HISTORY_YEARS, allow_all=True) +@clio.with_month(allow_all=True) +@clio.with_queue() +def extract_era5( + output_dir: str, + era5_dataset: str, + era5_variable: str, + year: str, + month: str, + queue: str, +) -> None: + cddata = ClimateDownscaleData(output_dir) + cred_path = cddata.credentials_root / "copernicus.yaml" + credentials = yaml.safe_load(cred_path.read_text()) + users = list(credentials["keys"]) + jobs_per_user = 20 + + datasets = ( + clio.VALID_ERA5_DATASETS if era5_dataset == clio.RUN_ALL else [era5_dataset] + ) + variables = ( + clio.VALID_ERA5_VARIABLES if era5_variable == clio.RUN_ALL else [era5_variable] + ) + years = clio.VALID_HISTORY_YEARS if year == clio.RUN_ALL else [year] + months = clio.VALID_MONTHS if month == clio.RUN_ALL else [month] + + to_download, to_compress, complete = build_task_lists( + cddata, + datasets, + variables, + years, + months, + ) + if not to_download: - print('No datasets to download') + print("No datasets to download") while to_download: downloads_left = len(to_download) @@ -256,7 +272,7 @@ def extract_era5( # noqa: PLR0913 for _ in range(jobs_per_user): for user in users: if to_download: - download_batch.append((*to_download.pop(), user)) + download_batch.append((*to_download.pop(), user)) # noqa: PERF401 if len(download_batch) != min(len(users) * jobs_per_user, downloads_left): msg = "Download batch size is incorrect" raise ValueError(msg) @@ -289,7 +305,7 @@ def extract_era5( # noqa: PLR0913 ) if not to_compress: - print('No datasets to compress.') + print("No datasets to compress.") return jobmon.run_parallel( diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index d1bcb2d..5c1dc6d 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -175,7 +175,7 @@ def generate_historical_daily_main( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year() +@clio.with_year(years=clio.VALID_HISTORY_YEARS) @with_target_variable() def generate_historical_daily_task( output_dir: str, @@ -187,7 +187,7 @@ def generate_historical_daily_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year(allow_all=True) +@clio.with_year(years=clio.VALID_HISTORY_YEARS, allow_all=True) @with_target_variable(allow_all=True) @clio.with_queue() 
@clio.with_overwrite() @@ -200,7 +200,7 @@ def generate_historical_daily( ) -> None: cd_data = ClimateDownscaleData(output_dir) - years = clio.VALID_YEARS if year == clio.RUN_ALL else [year] + years = clio.VALID_HISTORY_YEARS if year == clio.RUN_ALL else [year] variables = ( list(TRANSFORM_MAP.keys()) if target_variable == clio.RUN_ALL diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py index bb29ea1..d2a3f87 100644 --- a/src/climate_downscale/generate/historical_reference.py +++ b/src/climate_downscale/generate/historical_reference.py @@ -4,7 +4,6 @@ from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData -from climate_downscale.generate import utils from climate_downscale.generate.historical_daily import ( TRANSFORM_MAP, with_target_variable, @@ -18,7 +17,7 @@ def generate_historical_reference_main( cd_data = ClimateDownscaleData(output_dir) paths = [ cd_data.daily_results_path("historical", target_variable, year) - for year in utils.REFERENCE_YEARS + for year in clio.VALID_REFERENCE_YEARS ] print(f"Building reference data from: {len(paths)} files.") diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index bb32fbb..dd880f1 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -9,12 +9,7 @@ from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData from climate_downscale.generate import utils -from climate_downscale.generate.scenario_daily import VALID_YEARS -YEARS = { - "historical": clio.VALID_YEARS, - "scenario": VALID_YEARS, -} TEMP_THRESHOLDS = list(range(20, 35)) @@ -42,8 +37,11 @@ def __call__(self, *datasets: xr.Dataset) -> xr.Dataset: @property def encoding_kwargs(self) -> dict[str, float]: - if self.encoding_offset != 0. 
or self.encoding_scale != 1: - return {"add_offset": self.encoding_offset, "scale_factor": self.encoding_scale} + if self.encoding_offset != 0.0 or self.encoding_scale != 1: + return { + "add_offset": self.encoding_offset, + "scale_factor": self.encoding_scale, + } return {} @@ -160,27 +158,29 @@ def generate_scenario_annual_main( scenario: str, ) -> None: cd_data = ClimateDownscaleData(output_dir) - transform = TRANSFORM_MAP[target_variable] - + years = ( + clio.VALID_HISTORY_YEARS + if scenario == "historical" + else clio.VALID_FORECAST_YEARS + ) + variables = [] for source_variable in transform: - paths = [] - for scenario_label, year_list in YEARS.items(): - s = "historical" if scenario_label == "historical" else scenario - for year in year_list: - paths.append(cd_data.daily_results_path(s, source_variable, year)) + paths = [ + cd_data.daily_results_path(scenario, source_variable, year) + for year in years + ] variables.append( xr.open_mfdataset( - paths, - parallel=True, - chunks={'date': -1, 'latitude': 601, 'longitude': 1200}, + paths, + parallel=True, + chunks={"date": -1, "latitude": 601, "longitude": 1200}, ) ) ds = transform(*variables).compute() - - + cd_data.save_annual_results( ds, scenario=scenario, @@ -192,7 +192,7 @@ def generate_scenario_annual_main( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @with_target_variable() -@clio.with_cmip6_experiment() +@clio.with_cmip6_experiment(allow_historical=True) def generate_scenario_annual_task( output_dir: str, target_variable: str, @@ -204,7 +204,7 @@ def generate_scenario_annual_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @with_target_variable(allow_all=True) -@clio.with_cmip6_experiment(allow_all=True) +@clio.with_cmip6_experiment(allow_all=True, allow_historical=True) @clio.with_queue() @clio.with_overwrite() def generate_scenario_annual( diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 6ec2b2a..c38b592 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -12,8 +12,6 @@ from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData from climate_downscale.generate import utils -VALID_YEARS = [str(y) for y in range(max(utils.REFERENCE_YEARS) + 1, 2101)] - # Map from source variable to a unit conversion function CONVERT_MAP = { "uas": utils.scale_wind_speed_height, @@ -248,7 +246,7 @@ def generate_scenario_daily_main( # noqa: C901, PLR0912, PLR0915 @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year(years=VALID_YEARS) +@clio.with_year(years=clio.VALID_FORECAST_YEARS) @with_target_variable() @clio.with_cmip6_experiment() def generate_scenario_daily_task( @@ -259,7 +257,7 @@ def generate_scenario_daily_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_year(years=VALID_YEARS, allow_all=True) +@clio.with_year(years=clio.VALID_FORECAST_YEARS, allow_all=True) @with_target_variable(allow_all=True) @clio.with_cmip6_experiment(allow_all=True) @clio.with_queue() @@ -274,7 +272,7 @@ def generate_scenario_daily( ) -> None: cd_data = ClimateDownscaleData(output_dir) - years = VALID_YEARS if year == clio.RUN_ALL else [year] + years = clio.VALID_FORECAST_YEARS if year == clio.RUN_ALL else [year] variables = ( list(TRANSFORM_MAP.keys()) if target_variable == clio.RUN_ALL diff --git a/src/climate_downscale/generate/utils.py 
b/src/climate_downscale/generate/utils.py index e949f27..a2864c2 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -3,8 +3,12 @@ import numpy as np import xarray as xr -REFERENCE_YEARS = list(range(2018, 2024)) -REFERENCE_PERIOD = slice(f"{REFERENCE_YEARS[0]}-01-01", f"{REFERENCE_YEARS[-1]}-12-31") +import climate_downscale.cli_options as clio + +REFERENCE_PERIOD = slice( + f"{clio.VALID_REFERENCE_YEARS[0]}-01-01", + f"{clio.VALID_REFERENCE_YEARS[-1]}-12-31", +) TARGET_LON = xr.DataArray( np.round(np.arange(-180.0, 180.0, 0.1, dtype="float32"), 1), dims="longitude" ) From cd67a388f89d62a83c966dba0d1ba5d28850cb41 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 19 Jun 2024 09:38:25 -0700 Subject: [PATCH 62/71] Make scenario run by year --- src/climate_downscale/data.py | 9 ++- .../generate/scenario_annual.py | 74 +++++++++---------- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index cffdf82..5bf53f7 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -163,17 +163,20 @@ def load_daily_results( def annual_results(self) -> Path: return self.results / "annual" - def annual_results_path(self, scenario: str, variable: str) -> Path: - return self.annual_results / scenario / f"{variable}.nc" + def annual_results_path( + self, scenario: str, variable: str, year: int | str + ) -> Path: + return self.annual_results / scenario / variable / f"{year}.nc" def save_annual_results( self, results_ds: xr.Dataset, scenario: str, variable: str, + year: int | str, encoding_kwargs: dict[str, Any], ) -> None: - path = self.annual_results_path(scenario, variable) + path = self.annual_results_path(scenario, variable, year) mkdir(path.parent, exist_ok=True, parents=True) touch(path, exist_ok=True) diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index dd880f1..300c014 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -153,38 +153,22 @@ def with_target_variable( def generate_scenario_annual_main( - output_dir: str | Path, - target_variable: str, - scenario: str, + output_dir: str | Path, target_variable: str, scenario: str, year: str ) -> None: cd_data = ClimateDownscaleData(output_dir) transform = TRANSFORM_MAP[target_variable] - years = ( - clio.VALID_HISTORY_YEARS - if scenario == "historical" - else clio.VALID_FORECAST_YEARS - ) - - variables = [] - for source_variable in transform: - paths = [ - cd_data.daily_results_path(scenario, source_variable, year) - for year in years + ds = transform( + *[ + xr.open_dataset(cd_data.daily_results_path(scenario, source_variable, year)) + for source_variable in transform ] - variables.append( - xr.open_mfdataset( - paths, - parallel=True, - chunks={"date": -1, "latitude": 601, "longitude": 1200}, - ) - ) - ds = transform(*variables).compute() - + ) cd_data.save_annual_results( ds, scenario=scenario, variable=target_variable, + year=year, encoding_kwargs=transform.encoding_kwargs, ) @@ -193,12 +177,24 @@ def generate_scenario_annual_main( @clio.with_output_directory(DEFAULT_ROOT) @with_target_variable() @clio.with_cmip6_experiment(allow_historical=True) +@clio.with_year(years=clio.VALID_HISTORY_YEARS + clio.VALID_FORECAST_YEARS) def generate_scenario_annual_task( output_dir: str, target_variable: str, cmip6_experiment: str, + year: str, ) -> None: - 
generate_scenario_annual_main(output_dir, target_variable, cmip6_experiment) + if year in clio.VALID_HISTORY_YEARS and cmip6_experiment != "historical": + msg = "Historical years must use the 'historical' experiment." + raise ValueError(msg) + if year in clio.VALID_FORECAST_YEARS and cmip6_experiment == "historical": + msg = ( + f"Forecast years must use a future experiment: " + f"{clio.VALID_CMIP6_EXPERIMENTS}." + ) + raise ValueError(msg) + + generate_scenario_annual_main(output_dir, target_variable, cmip6_experiment, year) @click.command() # type: ignore[arg-type] @@ -227,34 +223,38 @@ def generate_scenario_annual( else [cmip6_experiment] ) - ve = [] + vey = [] complete = [] for v, e in itertools.product(variables, experiments): - path = cd_data.annual_results_path(scenario=e, variable=v) - if not path.exists() or overwrite: - ve.append((v, e)) - else: - complete.append((v, e)) + year_list = ( + clio.VALID_HISTORY_YEARS if e == "historical" else clio.VALID_FORECAST_YEARS + ) + for y in year_list: + path = cd_data.annual_results_path(scenario=e, variable=v, year=y) + if not path.exists() or overwrite: + vey.append((v, e, y)) + else: + complete.append((v, e, y)) - print(f"{len(complete)} tasks already done. {len(ve)} tasks to do.") - if not ve: + print(f"{len(complete)} tasks already done. {len(vey)} tasks to do.") + if not vey: return jobmon.run_parallel( runner="cdtask", task_name="generate scenario_annual", flat_node_args=( - ("target-variable", "cmip6-experiment"), - ve, + ("target-variable", "cmip6-experiment", "year"), + vey, ), task_args={ "output-dir": output_dir, }, task_resources={ "queue": queue, - "cores": 20, - "memory": "250G", - "runtime": "600m", + "cores": 2, + "memory": "100G", + "runtime": "120m", "project": "proj_rapidresponse", }, max_attempts=1, From f7e42fe597132e2ffa43e2a3214523b4783dbe4b Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 19 Jun 2024 12:56:50 -0700 Subject: [PATCH 63/71] Use transform class everywhere --- .../generate/derived_daily.py | 0 .../generate/historical_daily.py | 66 +++++----- .../generate/scenario_annual.py | 65 +++------ .../generate/scenario_daily.py | 124 ++++++++++-------- src/climate_downscale/generate/utils.py | 30 +++++ 5 files changed, 153 insertions(+), 132 deletions(-) create mode 100644 src/climate_downscale/generate/derived_daily.py diff --git a/src/climate_downscale/generate/derived_daily.py b/src/climate_downscale/generate/derived_daily.py new file mode 100644 index 0000000..e69de29 diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 5c1dc6d..a27e27c 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -32,35 +32,38 @@ # - a transformation function # - a tuple of offset and scale factors for the output for serialization TRANSFORM_MAP = { - "mean_temperature": ( - ["2m_temperature"], - utils.daily_mean, - (273.15, 0.01), + "mean_temperature": utils.Transform( + source_variables=["2m_temperature"], + transform_funcs=[utils.daily_mean], + encoding_scale=0.01, + encoding_offset=273.15, ), - "max_temperature": ( - ["2m_temperature"], - utils.daily_max, - (273.15, 0.01), + "max_temperature": utils.Transform( + source_variables=["2m_temperature"], + transform_funcs=[utils.daily_max], + encoding_scale=0.01, + encoding_offset=273.15, ), - "min_temperature": ( - ["2m_temperature"], - utils.daily_min, - (273.15, 0.01), + "min_temperature": utils.Transform( + source_variables=["2m_temperature"], + 
transform_funcs=[utils.daily_min], + encoding_scale=0.01, + encoding_offset=273.15, ), - "wind_speed": ( - ["10m_u_component_of_wind", "10m_v_component_of_wind"], - lambda x, y: utils.daily_mean(utils.vector_magnitude(x, y)), - (0, 0.01), + "wind_speed": utils.Transform( + source_variables=["10m_u_component_of_wind", "10m_v_component_of_wind"], + transform_funcs=[utils.vector_magnitude, utils.daily_mean], + encoding_scale=0.01, ), - "relative_humidity": ( - ["2m_temperature", "2m_dewpoint_temperature"], - lambda x, y: utils.daily_mean(utils.rh_percent(x, y)), - (0, 0.01), + "relative_humidity": utils.Transform( + source_variables=["2m_temperature", "2m_dewpoint_temperature"], + transform_funcs=[utils.rh_percent, utils.daily_mean], + encoding_scale=0.01, ), - "total_precipitation": ( - ["total_precipitation"], - utils.daily_sum, - (0, 0.1), + "total_precipitation": utils.Transform( + source_variables=["total_precipitation"], + transform_funcs=[utils.daily_sum], + encoding_scale=0.1, ), } @@ -128,17 +131,17 @@ def generate_historical_daily_main( ) -> None: cd_data = ClimateDownscaleData(output_dir) - source_variables, collapse_fun, (e_offset, e_scale) = TRANSFORM_MAP[target_variable] + transform = TRANSFORM_MAP[target_variable] datasets = [] for month in range(1, 13): month_str = f"{month:02d}" print(f"loading single-levels for {month_str}") single_level = [ load_variable(cd_data, sv, year, month_str, "single-levels") - for sv in source_variables + for sv in transform.source_variables ] print("collapsing") - ds = collapse_fun(*single_level).compute() # type: ignore[operator] + ds = transform(*single_level).compute() # collapsing often screws the date dtype, so fix it ds = ds.assign(date=pd.to_datetime(ds.date)) @@ -148,11 +151,11 @@ def generate_historical_daily_main( print(f"loading land for {month_str}") land = [ load_variable(cd_data, sv, year, month_str, "land") - for sv in source_variables + for sv in transform.source_variables ] print("collapsing") with dask.config.set(**{"array.slicing.split_large_chunks": False}): # type: ignore[arg-type] - ds_land = collapse_fun(*land).compute() # type: ignore[operator] + ds_land = transform(*land).compute() ds_land = ds_land.assign(date=pd.to_datetime(ds_land.date)) print("combining") @@ -166,10 +169,7 @@ def generate_historical_daily_main( scenario="historical", variable=target_variable, year=year, - encoding_kwargs={ - "add_offset": e_offset, - "scale_factor": e_scale, - }, + encoding_kwargs=transform.encoding_kwargs, ) diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index 300c014..0457ba3 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -13,69 +13,40 @@ TEMP_THRESHOLDS = list(range(20, 35)) -class Transform: - def __init__( - self, - source_variables: list[str], - transform_funcs: list[typing.Callable[..., xr.Dataset]] = [utils.annual_mean], # noqa: B006 - encoding_scale: float = 1.0, - encoding_offset: float = 0.0, - ): - self.source_variables = source_variables - self.transform_funcs = transform_funcs - self.encoding_scale = encoding_scale - self.encoding_offset = encoding_offset - - def __iter__(self) -> typing.Iterator[str]: - return iter(self.source_variables) - - def __call__(self, *datasets: xr.Dataset) -> xr.Dataset: - res = self.transform_funcs[0](*datasets) - for transform_func in self.transform_funcs[1:]: - res = transform_func(res) - return res - - @property - def encoding_kwargs(self) -> 
dict[str, float]: - if self.encoding_offset != 0.0 or self.encoding_scale != 1: - return { - "add_offset": self.encoding_offset, - "scale_factor": self.encoding_scale, - } - return {} - - TRANSFORM_MAP = { - "mean_temperature": Transform( + "mean_temperature": utils.Transform( source_variables=["mean_temperature"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), - "mean_high_temperature": Transform( + "mean_high_temperature": utils.Transform( source_variables=["max_temperature"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), - "mean_low_temperature": Transform( + "mean_low_temperature": utils.Transform( source_variables=["min_temperature"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ - f"days_over_{temp}C": Transform( + f"days_over_{temp}C": utils.Transform( source_variables=["mean_temperature"], transform_funcs=[utils.count_threshold(temp), utils.annual_sum], ) for temp in TEMP_THRESHOLDS }, - "mean_heat_index": Transform( + "mean_heat_index": utils.Transform( source_variables=["mean_temperature", "relative_humidity"], transform_funcs=[utils.heat_index, utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ - f"days_over_{temp}C_heat_index": Transform( + f"days_over_{temp}C_heat_index": utils.Transform( source_variables=["mean_temperature", "relative_humidity"], transform_funcs=[ utils.heat_index, @@ -85,14 +56,14 @@ def encoding_kwargs(self) -> dict[str, float]: ) for temp in TEMP_THRESHOLDS }, - "mean_humidex": Transform( + "mean_humidex": utils.Transform( source_variables=["mean_temperature", "relative_humidity"], transform_funcs=[utils.humidex, utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ - f"days_over_{temp}C_humidex": Transform( + f"days_over_{temp}C_humidex": utils.Transform( source_variables=["mean_temperature", "relative_humidity"], transform_funcs=[ utils.humidex, @@ -102,14 +73,14 @@ def encoding_kwargs(self) -> dict[str, float]: ) for temp in TEMP_THRESHOLDS }, - "mean_effective_temperature": Transform( + "mean_effective_temperature": utils.Transform( source_variables=["mean_temperature", "relative_humidity", "wind_speed"], transform_funcs=[utils.effective_temperature, utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ - f"days_over_{temp}C_effective_temperature": Transform( + f"days_over_{temp}C_effective_temperature": utils.Transform( source_variables=["mean_temperature", "relative_humidity", "wind_speed"], transform_funcs=[ utils.effective_temperature, @@ -119,15 +90,17 @@ def encoding_kwargs(self) -> dict[str, float]: ) for temp in TEMP_THRESHOLDS }, - "wind_speed": Transform( + "wind_speed": utils.Transform( source_variables=["wind_speed"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, ), - "relative_humidity": Transform( + "relative_humidity": utils.Transform( source_variables=["relative_humidity"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, ), - "total_precipitation": Transform( + "total_precipitation": utils.Transform( source_variables=["total_precipitation"], transform_funcs=[utils.annual_sum], encoding_scale=0.1, @@ -161,7 +134,7 @@ def generate_scenario_annual_main( ds = transform( *[ xr.open_dataset(cd_data.daily_results_path(scenario, source_variable, year)) - for source_variable in transform + for source_variable in transform.source_variables ] ) cd_data.save_annual_results( diff --git a/src/climate_downscale/generate/scenario_daily.py 
b/src/climate_downscale/generate/scenario_daily.py index c38b592..56081b5 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -28,41 +28,56 @@ # - a transformation function # - a tuple of offset and scale factors for the output for serialization # - an anomaly type -TRANSFORM_MAP = { +TRANSFORM_MAP: dict[str, tuple[utils.Transform, str]] = { "mean_temperature": ( - ["tas"], - utils.identity, - (273.15, 0.01), + utils.Transform( + source_variables=["tas"], + transform_funcs=[utils.identity], + encoding_scale=0.01, + encoding_offset=273.15, + ), "additive", ), "max_temperature": ( - ["tasmax"], - utils.identity, - (273.15, 0.01), + utils.Transform( + source_variables=["tasmax"], + transform_funcs=[utils.identity], + encoding_scale=0.01, + encoding_offset=273.15, + ), "additive", ), "min_temperature": ( - ["tasmin"], - utils.identity, - (273.15, 0.01), + utils.Transform( + source_variables=["tasmin"], + transform_funcs=[utils.identity], + encoding_scale=0.01, + encoding_offset=273.15, + ), "additive", ), "wind_speed": ( - ["uas", "vas"], - utils.vector_magnitude, - (0, 0.01), + utils.Transform( + source_variables=["uas", "vas"], + transform_funcs=[utils.vector_magnitude], + encoding_scale=0.01, + ), "multiplicative", ), "relative_humidity": ( - ["hurs"], - utils.identity, - (0, 0.01), + utils.Transform( + source_variables=["hurs"], + transform_funcs=[utils.identity], + encoding_scale=0.01, + ), "multiplicative", ), "total_precipitation": ( - ["pr"], - utils.identity, - (0, 0.1), + utils.Transform( + source_variables=["pr"], + transform_funcs=[utils.identity], + encoding_scale=0.1, + ), "multiplicative", ), } @@ -85,6 +100,36 @@ def with_target_variable( ) +def get_source_paths( + cd_data: ClimateDownscaleData, + source_variables: list[str], + cmip6_experiment: str, +) -> list[list[Path]]: + models_by_var = {} + for source_variable in source_variables: + model_vars = { + p.stem.split(f"{cmip6_experiment}_")[1] + for p in cd_data.extracted_cmip6.glob( + f"{source_variable}_{cmip6_experiment}*.nc" + ) + } + models_by_var[source_variable] = model_vars + + shared_models = set.intersection(*models_by_var.values()) + for var, models in models_by_var.items(): + extra_models = models.difference(shared_models) + if extra_models: + print(var, extra_models) + source_paths = [ + [ + cd_data.extracted_cmip6 / f"{source_variable}_{cmip6_experiment}_{model}.nc" + for source_variable in source_variables + ] + for model in sorted(shared_models) + ] + return source_paths + + def load_and_shift_longitude( ds_path: str | Path, time_slice: slice, @@ -140,7 +185,7 @@ def compute_anomaly( return anomaly -def generate_scenario_daily_main( # noqa: C901, PLR0912, PLR0915 +def generate_scenario_daily_main( # noqa: PLR0912 output_dir: str | Path, year: str | int, target_variable: str, @@ -148,33 +193,11 @@ def generate_scenario_daily_main( # noqa: C901, PLR0912, PLR0915 ) -> None: cd_data = ClimateDownscaleData(output_dir) - (source_variables, transform_fun, (e_offset, e_scale), anomaly_type) = ( - TRANSFORM_MAP[target_variable] + transform, anomaly_type = TRANSFORM_MAP[target_variable] + source_paths = get_source_paths( + cd_data, transform.source_variables, cmip6_experiment ) - models_by_var = {} - for source_variable in source_variables: - model_vars = { - p.stem.split(f"{cmip6_experiment}_")[1] - for p in cd_data.extracted_cmip6.glob( - f"{source_variable}_{cmip6_experiment}*.nc" - ) - } - models_by_var[source_variable] = model_vars - - shared_models 
= set.intersection(*models_by_var.values()) - for var, models in models_by_var.items(): - extra_models = models.difference(shared_models) - if extra_models: - print(var, extra_models) - source_paths = [ - [ - cd_data.extracted_cmip6 / f"{source_variable}_{cmip6_experiment}_{model}.nc" - for source_variable in source_variables - ] - for model in sorted(shared_models) - ] - print("loading historical reference") historical_reference = cd_data.load_daily_results( scenario="historical", @@ -187,13 +210,11 @@ def generate_scenario_daily_main( # noqa: C901, PLR0912, PLR0915 pid = f"{i+1}/{len(source_paths)} {sps[0].stem}" print(f"{pid}: Loading reference") try: - scenario_reference = transform_fun( # type: ignore[operator] + scenario_reference = transform( *[load_variable(sp, "reference") for sp in sps] ) print(f"{pid}: Loading target") - target = transform_fun( # type: ignore[operator] - *[load_variable(sp, year) for sp in sps] - ) + target = transform(*[load_variable(sp, year) for sp in sps]) except KeyError: print(f"{pid}: Bad formatting, skipping...") continue @@ -237,10 +258,7 @@ def generate_scenario_daily_main( # noqa: C901, PLR0912, PLR0915 scenario=cmip6_experiment, variable=target_variable, year=year, - encoding_kwargs={ - "add_offset": e_offset, - "scale_factor": e_scale, - }, + encoding_kwargs=transform.encoding_kwargs, ) diff --git a/src/climate_downscale/generate/utils.py b/src/climate_downscale/generate/utils.py index a2864c2..7a77713 100644 --- a/src/climate_downscale/generate/utils.py +++ b/src/climate_downscale/generate/utils.py @@ -1,3 +1,4 @@ +import typing from collections.abc import Callable import numpy as np @@ -325,3 +326,32 @@ def interpolate_to_target_latlon( .interpolate_na(dim="latitude", method="nearest", fill_value="extrapolate") .sortby("latitude", ascending=False) ) + + +class Transform: + def __init__( + self, + source_variables: list[str], + transform_funcs: list[typing.Callable[..., xr.Dataset]], + encoding_scale: float = 1.0, + encoding_offset: float = 0.0, + ): + self.source_variables = source_variables + self.transform_funcs = transform_funcs + self.encoding_scale = encoding_scale + self.encoding_offset = encoding_offset + + def __call__(self, *datasets: xr.Dataset) -> xr.Dataset: + res = self.transform_funcs[0](*datasets) + for transform_func in self.transform_funcs[1:]: + res = transform_func(res) + return res + + @property + def encoding_kwargs(self) -> dict[str, float]: + if self.encoding_offset != 0.0 or self.encoding_scale != 1: + return { + "add_offset": self.encoding_offset, + "scale_factor": self.encoding_scale, + } + return {} From 1611af2ef35ad4f36850cea335eda6dcd4a5d19f Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 19 Jun 2024 13:02:14 -0700 Subject: [PATCH 64/71] pullback with_target_variable --- src/climate_downscale/cli_options.py | 14 ++++++++++++ .../generate/historical_daily.py | 21 ++---------------- .../generate/historical_reference.py | 5 ++--- .../generate/scenario_annual.py | 22 ++----------------- .../generate/scenario_daily.py | 22 ++----------------- 5 files changed, 22 insertions(+), 62 deletions(-) diff --git a/src/climate_downscale/cli_options.py b/src/climate_downscale/cli_options.py index 1612f6e..b1888da 100644 --- a/src/climate_downscale/cli_options.py +++ b/src/climate_downscale/cli_options.py @@ -172,6 +172,20 @@ def with_cmip6_variable( ) +def with_target_variable( + *, + variable_names: list[str], + allow_all: bool = False, +) -> ClickOption[_P, _T]: + return with_choice( + "target-variable", + "t", + 
allow_all=allow_all, + choices=variable_names, + help="Variable to generate.", + ) + + STRIDE = 30 LATITUDES = [str(lat) for lat in range(-90, 90, STRIDE)] LONGITUDES = [str(lon) for lon in range(-180, 180, STRIDE)] diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index a27e27c..5eeddde 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -1,5 +1,4 @@ import itertools -import typing from pathlib import Path import click @@ -67,22 +66,6 @@ ), } -_P = typing.ParamSpec("_P") -_T = typing.TypeVar("_T") - - -def with_target_variable( - *, - allow_all: bool = False, -) -> clio.ClickOption[_P, _T]: - return clio.with_choice( - "target-variable", - "t", - allow_all=allow_all, - choices=list(TRANSFORM_MAP.keys()), - help="Variable to generate.", - ) - def load_and_shift_longitude(ds_path: str | Path) -> xr.Dataset: ds = xr.open_dataset(ds_path).chunk(time=24) @@ -176,7 +159,7 @@ def generate_historical_daily_main( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year(years=clio.VALID_HISTORY_YEARS) -@with_target_variable() +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) def generate_historical_daily_task( output_dir: str, year: str, @@ -188,7 +171,7 @@ def generate_historical_daily_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year(years=clio.VALID_HISTORY_YEARS, allow_all=True) -@with_target_variable(allow_all=True) +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP), allow_all=True) @clio.with_queue() @clio.with_overwrite() def generate_historical_daily( diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py index d2a3f87..de030a2 100644 --- a/src/climate_downscale/generate/historical_reference.py +++ b/src/climate_downscale/generate/historical_reference.py @@ -6,7 +6,6 @@ from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData from climate_downscale.generate.historical_daily import ( TRANSFORM_MAP, - with_target_variable, ) @@ -54,7 +53,7 @@ def generate_historical_reference_main( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@with_target_variable() +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) def generate_historical_reference_task( output_dir: str, target_variable: str, @@ -64,7 +63,7 @@ def generate_historical_reference_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@with_target_variable(allow_all=True) +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) @clio.with_queue() def generate_historical_reference( output_dir: str, diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index 0457ba3..859185b 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -1,5 +1,4 @@ import itertools -import typing from pathlib import Path import click @@ -108,23 +107,6 @@ } -_P = typing.ParamSpec("_P") -_T = typing.TypeVar("_T") - - -def with_target_variable( - *, - allow_all: bool = False, -) -> clio.ClickOption[_P, _T]: - return clio.with_choice( - "target-variable", - "t", - allow_all=allow_all, - choices=list(TRANSFORM_MAP.keys()), - help="Variable to generate.", - ) - - def generate_scenario_annual_main( output_dir: str | 
Path, target_variable: str, scenario: str, year: str ) -> None: @@ -148,7 +130,7 @@ def generate_scenario_annual_main( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@with_target_variable() +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) @clio.with_cmip6_experiment(allow_historical=True) @clio.with_year(years=clio.VALID_HISTORY_YEARS + clio.VALID_FORECAST_YEARS) def generate_scenario_annual_task( @@ -172,7 +154,7 @@ def generate_scenario_annual_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@with_target_variable(allow_all=True) +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP), allow_all=True) @clio.with_cmip6_experiment(allow_all=True, allow_historical=True) @clio.with_queue() @clio.with_overwrite() diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 56081b5..12cd391 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -1,5 +1,4 @@ import itertools -import typing from pathlib import Path import click @@ -83,23 +82,6 @@ } -_P = typing.ParamSpec("_P") -_T = typing.TypeVar("_T") - - -def with_target_variable( - *, - allow_all: bool = False, -) -> clio.ClickOption[_P, _T]: - return clio.with_choice( - "target-variable", - "t", - allow_all=allow_all, - choices=list(TRANSFORM_MAP.keys()), - help="Variable to generate.", - ) - - def get_source_paths( cd_data: ClimateDownscaleData, source_variables: list[str], @@ -265,7 +247,7 @@ def generate_scenario_daily_main( # noqa: PLR0912 @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year(years=clio.VALID_FORECAST_YEARS) -@with_target_variable() +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) @clio.with_cmip6_experiment() def generate_scenario_daily_task( output_dir: str, year: str, target_variable: str, cmip6_experiment: str @@ -276,7 +258,7 @@ def generate_scenario_daily_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) @clio.with_year(years=clio.VALID_FORECAST_YEARS, allow_all=True) -@with_target_variable(allow_all=True) +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP), allow_all=True) @clio.with_cmip6_experiment(allow_all=True) @clio.with_queue() @clio.with_overwrite() From 62935665f6bfd6d25f32987909ef8108e6ef06b1 Mon Sep 17 00:00:00 2001 From: collijk Date: Wed, 19 Jun 2024 13:45:46 -0700 Subject: [PATCH 65/71] Add script to generate derived daily variables --- src/climate_downscale/generate/__init__.py | 6 + .../generate/derived_daily.py | 140 ++++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index 4f4afa8..ded9def 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -1,3 +1,7 @@ +from climate_downscale.generate.derived_daily import ( + generate_derived_daily, + generate_derived_daily_task, +) from climate_downscale.generate.historical_daily import ( generate_historical_daily, generate_historical_daily_task, @@ -19,6 +23,7 @@ "historical_daily": generate_historical_daily, "historical_reference": generate_historical_reference, "scenario_daily": generate_scenario_daily, + "derived_daily": generate_derived_daily, "scenario_annual": generate_scenario_annual, } @@ -26,5 +31,6 @@ "historical_daily": generate_historical_daily_task, 
"historical_reference": generate_historical_reference_task, "scenario_daily": generate_scenario_daily_task, + "derived_daily": generate_derived_daily_task, "scenario_annual": generate_scenario_annual_task, } diff --git a/src/climate_downscale/generate/derived_daily.py b/src/climate_downscale/generate/derived_daily.py index e69de29..6df575d 100644 --- a/src/climate_downscale/generate/derived_daily.py +++ b/src/climate_downscale/generate/derived_daily.py @@ -0,0 +1,140 @@ +import itertools + +import click +from rra_tools import jobmon + +from climate_downscale import cli_options as clio +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData +from climate_downscale.generate import utils + +TRANSFORM_MAP = { + "heat_index": utils.Transform( + source_variables=["mean_temperature", "relative_humidity"], + transform_funcs=[utils.heat_index], + encoding_scale=0.01, + encoding_offset=273.15, + ), + "humidex": utils.Transform( + source_variables=["mean_temperature", "relative_humidity"], + transform_funcs=[utils.humidex], + encoding_scale=0.01, + encoding_offset=273.15, + ), + "effective_temperature": utils.Transform( + source_variables=["mean_temperature", "relative_humidity", "wind_speed"], + transform_funcs=[utils.effective_temperature], + encoding_scale=0.01, + encoding_offset=273.15, + ), +} + + +def generate_derived_daily_main( + output_dir: str, + target_variable: str, + scenario: str, + year: str, +) -> None: + cd_data = ClimateDownscaleData(output_dir) + transform = TRANSFORM_MAP[target_variable] + + ds = transform( + *[ + cd_data.load_daily_results(scenario, source_variable, year) + for source_variable in transform.source_variables + ] + ) + cd_data.save_daily_results( + ds, + scenario=scenario, + variable=target_variable, + year=year, + encoding_kwargs=transform.encoding_kwargs, + ) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) +@clio.with_cmip6_experiment(allow_historical=True) +@clio.with_year(years=clio.VALID_HISTORY_YEARS + clio.VALID_FORECAST_YEARS) +def generate_derived_daily_task( + output_dir: str, + target_variable: str, + cmip6_experiment: str, + year: str, +) -> None: + if year in clio.VALID_HISTORY_YEARS and cmip6_experiment != "historical": + msg = "Historical years must use the 'historical' experiment." + raise ValueError(msg) + if year in clio.VALID_FORECAST_YEARS and cmip6_experiment == "historical": + msg = ( + f"Forecast years must use a future experiment: " + f"{clio.VALID_CMIP6_EXPERIMENTS}." 
+ ) + raise ValueError(msg) + generate_derived_daily_main(output_dir, target_variable, cmip6_experiment, year) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_target_variable(variable_names=list(TRANSFORM_MAP), allow_all=True) +@clio.with_cmip6_experiment(allow_all=True, allow_historical=True) +@clio.with_queue() +@clio.with_overwrite() +def generate_derived_daily( + output_dir: str, + target_variable: str, + cmip6_experiment: str, + queue: str, + overwrite: bool, # noqa: FBT001 +) -> None: + cd_data = ClimateDownscaleData(output_dir) + + variables = ( + list(TRANSFORM_MAP.keys()) + if target_variable == clio.RUN_ALL + else [target_variable] + ) + experiments = ( + list(clio.VALID_CMIP6_EXPERIMENTS) + if cmip6_experiment == clio.RUN_ALL + else [cmip6_experiment] + ) + + vey = [] + complete = [] + for v, e in itertools.product(variables, experiments): + year_list = ( + clio.VALID_HISTORY_YEARS if e == "historical" else clio.VALID_FORECAST_YEARS + ) + for y in year_list: + path = cd_data.annual_results_path(scenario=e, variable=v, year=y) + if not path.exists() or overwrite: + vey.append((v, e, y)) + else: + complete.append((v, e, y)) + + print(f"{len(complete)} tasks already done. {len(vey)} tasks to do.") + if not vey: + return + + jobmon.run_parallel( + runner="cdtask", + task_name="generate derived_daily", + flat_node_args=( + ("target-variable", "cmip6-experiment", "year"), + vey, + ), + task_args={ + "output-dir": output_dir, + }, + task_resources={ + "queue": queue, + "cores": 2, + "memory": "100G", + "runtime": "120m", + "project": "proj_rapidresponse", + }, + max_attempts=1, + ) From b1f5f12f92f6e2d4dfd85dc072fd6f0a60ba6201 Mon Sep 17 00:00:00 2001 From: collijk Date: Sun, 7 Jul 2024 11:52:24 -0700 Subject: [PATCH 66/71] Add README for pipeline stages --- src/climate_downscale/generate/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/climate_downscale/generate/README.md diff --git a/src/climate_downscale/generate/README.md b/src/climate_downscale/generate/README.md new file mode 100644 index 0000000..b52eda9 --- /dev/null +++ b/src/climate_downscale/generate/README.md @@ -0,0 +1,20 @@ +# Climate Variable Pipeline + +This set of scripts processes ERA5 and CMIP6 climate data into a database of +climate variables at a consistent resolution and format. The pipeline is +run in several stages: + +1. Historical Daily: This processes the hourly ERA5-Land and ERA5-Single-Level + data into a unified daily format, pulling the higher-resolution ERA5-Land data + where available and filling in with ERA5-Single-Level data. +2. Historical Reference: This produces a set of reference climatologies from the + historical daily results by averaging, for each month and pixel, over a historical + reference period. These are used to downscale and bias-correct the CMIP6 data. +3. Scenario Daily: This produces scenario projections from the CMIP6 data, ensembling + over a curated set of GCMs and using the historical reference climatologies to + bias-correct the data. +4. Derived Daily: This produces derived climate variables from the daily data, such as + humidex and effective temperature. This writes results to the same directories + as the daily data. +5. Scenario Annual: This produces annualized summaries of the scenario data, such as + annual averages and extremes.
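To make the bias-correction in stages 2–3 concrete, here is a minimal sketch of the delta-method adjustment the pipeline applies. It assumes daily scenario data and monthly reference climatologies held as xarray datasets with a `date` coordinate; `compute_anomaly` and `apply_anomaly` are illustrative names for this note, not the module's actual API, which additionally handles ensembling, interpolation, and encoding.

```python
import xarray as xr


def compute_anomaly(reference: xr.Dataset, target: xr.Dataset, kind: str) -> xr.Dataset:
    """Anomaly of a scenario run relative to its own reference-period climatology."""
    if kind == "additive":  # temperature-like variables
        return target.groupby("date.month") - reference
    # Multiplicative anomalies suit strictly positive variables like precipitation.
    return target.groupby("date.month") / reference


def apply_anomaly(era5_reference: xr.Dataset, anomaly: xr.Dataset, kind: str) -> xr.Dataset:
    """Shift (or scale) the observed climatology by the modeled change signal."""
    if kind == "additive":
        out = era5_reference + anomaly.groupby("date.month")
    else:
        out = era5_reference * anomaly.groupby("date.month")
    # The groupby arithmetic leaves a helper `month` coordinate behind; drop it.
    return out.drop_vars("month")
```

Working in anomaly space means each GCM contributes only its change signal while the absolute level comes from ERA5, which is why the stage-2 reference climatologies must exist before stage 3 can run.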
From e5a136acb51a3d513fa04c963f81f6bf2c45eb45 Mon Sep 17 00:00:00 2001 From: James Collins Date: Mon, 8 Jul 2024 11:49:22 -0700 Subject: [PATCH 67/71] Fix derived climate variables --- .../generate/derived_daily.py | 30 ++++++++++++---- .../generate/scenario_annual.py | 34 +++++++++++-------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/src/climate_downscale/generate/derived_daily.py b/src/climate_downscale/generate/derived_daily.py index 6df575d..1a4735b 100644 --- a/src/climate_downscale/generate/derived_daily.py +++ b/src/climate_downscale/generate/derived_daily.py @@ -2,6 +2,7 @@ import click from rra_tools import jobmon +from dask.diagnostics import ProgressBar from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData @@ -34,16 +35,27 @@ def generate_derived_daily_main( target_variable: str, scenario: str, year: str, + progress_bar: bool = False ) -> None: cd_data = ClimateDownscaleData(output_dir) transform = TRANSFORM_MAP[target_variable] + # Empirically tested to find a good balance between + # runtime and memory usage for data at this scale. + chunks = {"latitude": -1, "longitude": -1, "date": 20} + ds = transform( *[ - cd_data.load_daily_results(scenario, source_variable, year) + cd_data.load_daily_results(scenario, source_variable, year).chunk(**chunks) for source_variable in transform.source_variables ] ) + if progress_bar: + with ProgressBar(): + ds = ds.compute() + else: + ds = ds.compute() + cd_data.save_daily_results( ds, scenario=scenario, @@ -97,7 +109,7 @@ def generate_derived_daily( else [target_variable] ) experiments = ( - list(clio.VALID_CMIP6_EXPERIMENTS) + clio.VALID_CMIP6_EXPERIMENTS + ['historical'] if cmip6_experiment == clio.RUN_ALL else [cmip6_experiment] ) @@ -109,11 +121,15 @@ def generate_derived_daily( clio.VALID_HISTORY_YEARS if e == "historical" else clio.VALID_FORECAST_YEARS ) for y in year_list: - path = cd_data.annual_results_path(scenario=e, variable=v, year=y) + path = cd_data.daily_results_path(scenario=e, variable=v, year=y) + if path.exists() and path.stat().st_size == 0: + # job failed when writing, delete the file + path.unlink() + if not path.exists() or overwrite: vey.append((v, e, y)) else: - complete.append((v, e, y)) + complete.append((v, e, y)) print(f"{len(complete)} tasks already done. 
{len(vey)} tasks to do.") if not vey: @@ -131,9 +147,9 @@ def generate_derived_daily( }, task_resources={ "queue": queue, - "cores": 2, - "memory": "100G", - "runtime": "120m", + "cores": 8, + "memory": "150G", + "runtime": "45m", "project": "proj_rapidresponse", }, max_attempts=1, diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index 859185b..83ceab9 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -39,16 +39,15 @@ for temp in TEMP_THRESHOLDS }, "mean_heat_index": utils.Transform( - source_variables=["mean_temperature", "relative_humidity"], - transform_funcs=[utils.heat_index, utils.annual_mean], + source_variables=["heat_index"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ f"days_over_{temp}C_heat_index": utils.Transform( - source_variables=["mean_temperature", "relative_humidity"], + source_variables=["heat_index"], transform_funcs=[ - utils.heat_index, utils.count_threshold(temp), utils.annual_sum, ], @@ -56,16 +55,15 @@ for temp in TEMP_THRESHOLDS }, "mean_humidex": utils.Transform( - source_variables=["mean_temperature", "relative_humidity"], - transform_funcs=[utils.humidex, utils.annual_mean], + source_variables=["humidex"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ f"days_over_{temp}C_humidex": utils.Transform( - source_variables=["mean_temperature", "relative_humidity"], + source_variables=["humidex"], transform_funcs=[ - utils.humidex, utils.count_threshold(temp), utils.annual_sum, ], @@ -73,16 +71,15 @@ for temp in TEMP_THRESHOLDS }, "mean_effective_temperature": utils.Transform( - source_variables=["mean_temperature", "relative_humidity", "wind_speed"], - transform_funcs=[utils.effective_temperature, utils.annual_mean], + source_variables=["effective_temperature"], + transform_funcs=[utils.annual_mean], encoding_scale=0.01, encoding_offset=273.15, ), **{ f"days_over_{temp}C_effective_temperature": utils.Transform( - source_variables=["mean_temperature", "relative_humidity", "wind_speed"], + source_variables=["effective_temperature"], transform_funcs=[ - utils.effective_temperature, utils.count_threshold(temp), utils.annual_sum, ], @@ -108,7 +105,7 @@ def generate_scenario_annual_main( - output_dir: str | Path, target_variable: str, scenario: str, year: str + output_dir: str | Path, target_variable: str, scenario: str, year: str, progress_bar: bool = False ) -> None: cd_data = ClimateDownscaleData(output_dir) transform = TRANSFORM_MAP[target_variable] @@ -119,6 +116,13 @@ def generate_scenario_annual_main( for source_variable in transform.source_variables ] ) + + if progress_bar: + with ProgressBar(): + ds = ds.compute() + else: + ds = ds.compute() + cd_data.save_annual_results( ds, scenario=scenario, @@ -173,7 +177,7 @@ def generate_scenario_annual( else [target_variable] ) experiments = ( - list(clio.VALID_CMIP6_EXPERIMENTS) + clio.VALID_CMIP6_EXPERIMENTS + ['historical'] if cmip6_experiment == clio.RUN_ALL else [cmip6_experiment] ) @@ -207,7 +211,7 @@ def generate_scenario_annual( }, task_resources={ "queue": queue, - "cores": 2, + "cores": 3, "memory": "100G", "runtime": "120m", "project": "proj_rapidresponse", From 3b435d3aa2f5971d470a0eb56e87ee98c4cce370 Mon Sep 17 00:00:00 2001 From: James Collins Date: Mon, 8 Jul 2024 11:49:33 -0700 Subject: [PATCH 68/71] start scenario inclusion script --- .../generate/scenario_inclusion.py | 78 
+++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/climate_downscale/generate/scenario_inclusion.py diff --git a/src/climate_downscale/generate/scenario_inclusion.py b/src/climate_downscale/generate/scenario_inclusion.py new file mode 100644 index 0000000..6d416f4 --- /dev/null +++ b/src/climate_downscale/generate/scenario_inclusion.py @@ -0,0 +1,78 @@ +from pathlib import Path + +import pandas as pd +import xarray as xr +from rra_tools import parallel +import tqdm + +from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData + +import warnings + +warnings.filterwarnings('ignore') + +cd_data = ClimateDownscaleData(output_dir) +paths = list(cd_data.extracted_cmip6.glob(f'*.nc')) + +def extract_metadata(data_path: Path) -> tuple: + variable, scenario, source, variant = data_path.stem.split('_') + + realization = variant.split('i')[0][1:] + initialization = variant.split('i')[1].split('p')[0] + physics = variant.split('p')[1].split('f')[0] + forcing = variant.split('f')[1] + + + ds = xr.open_dataset(data_path) + year_start = ds['time.year'].min().item() + year_end = ds['time.year'].max().item() + return (variable, scenario, source, variant, realization, initialization, physics, forcing, year_start, year_end) + +meta_list = parallel.run_parallel( + extract_metadata, + paths, + num_cores=25, + progress_bar=True, +) + +meta_df = ( + pd.DataFrame( + meta_list, + columns=[ + 'variable', + 'scenario', + 'source', + 'variant', + 'realization', + 'initialization', + 'physics', + 'forcing', + 'year_start', + 'year_end', + ], + ).assign( + all_years=lambda x: (x.year_start <= 2020) & (x.year_end >= 2099), + year_range=lambda x: x.apply(lambda r: f"{r.loc['year_start']}_{r.loc['year_end']}", axis=1), + ) +) + +valid_scenarios = ( + meta_df + .set_index(['variable', 'source', 'variant', 'scenario']).all_years + .unstack() + .fillna(False) + .sum(axis=1) + .rename('valid_scenarios') +) +year_range = ( + meta_df + .set_index(['variable', 'source', 'variant', 'scenario']).year_range + .unstack() + .fillna("") +) +inclusion_df = pd.concat([ + year_range, + valid_scenarios, + meta_df.drop(columns=['scenario', 'year_start', 'year_end', 'all_years', 'year_range']).drop_duplicates().set_index(['variable', 'source', 'variant']) +], axis=1) +inclusion_df['include'] = inclusion_df.valid_scenarios == 5 \ No newline at end of file From 70f5abf7af730aaff56828a0b35a41150d7daca1 Mon Sep 17 00:00:00 2001 From: collijk Date: Mon, 8 Jul 2024 12:15:40 -0700 Subject: [PATCH 69/71] Add task to generate scenario inclusion metadata --- pyproject.toml | 3 + src/climate_downscale/data.py | 22 ++++ src/climate_downscale/generate/__init__.py | 5 + .../generate/derived_daily.py | 20 +-- .../generate/scenario_annual.py | 15 ++- .../generate/scenario_inclusion.py | 124 +++++++++--------- 6 files changed, 111 insertions(+), 78 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 731596e..7e91877 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,6 +100,9 @@ ignore = [ "RET504", # Unnecessary assignment before return "PLR0913", # Too many arguments in function call, hard with CLIs. 
"TRY201", # + "PD010", # I like stack and unstack + "FBT001", # Boolean positional args are super common in clis + "FBT002", # Boolean positional args are super common in clis ] [tool.ruff.lint.per-file-ignores] diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 5bf53f7..71fa346 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -126,6 +126,28 @@ def results(self) -> Path: def daily_results(self) -> Path: return self.results / "daily" + @property + def results_metadata(self) -> Path: + return self.results / "metadata" + + def save_scenario_metadata(self, df: pd.DataFrame) -> None: + path = self.results_metadata / "scenario_metadata.parquet" + touch(path, exist_ok=True) + df.to_parquet(path) + + def load_scenario_metadata(self) -> pd.DataFrame: + path = self.results_metadata / "scenario_metadata.parquet" + return pd.read_parquet(path) + + def save_scenario_inclusion_metadata(self, df: pd.DataFrame) -> None: + path = self.results_metadata / "scenario_inclusion_metadata.parquet" + touch(path, exist_ok=True) + df.to_parquet(path) + + def load_scenario_inclusion_metadata(self) -> pd.DataFrame: + path = self.results_metadata / "scenario_inclusion_metadata.parquet" + return pd.read_parquet(path) + def daily_results_path(self, scenario: str, variable: str, year: int | str) -> Path: return self.daily_results / scenario / variable / f"{year}.nc" diff --git a/src/climate_downscale/generate/__init__.py b/src/climate_downscale/generate/__init__.py index ded9def..d17a310 100644 --- a/src/climate_downscale/generate/__init__.py +++ b/src/climate_downscale/generate/__init__.py @@ -18,10 +18,14 @@ generate_scenario_daily, generate_scenario_daily_task, ) +from climate_downscale.generate.scenario_inclusion import ( + generate_scenario_inclusion, +) RUNNERS = { "historical_daily": generate_historical_daily, "historical_reference": generate_historical_reference, + "scenario_inclusion": generate_scenario_inclusion, "scenario_daily": generate_scenario_daily, "derived_daily": generate_derived_daily, "scenario_annual": generate_scenario_annual, @@ -30,6 +34,7 @@ TASK_RUNNERS = { "historical_daily": generate_historical_daily_task, "historical_reference": generate_historical_reference_task, + "scenario_inclusion": generate_scenario_inclusion, "scenario_daily": generate_scenario_daily_task, "derived_daily": generate_derived_daily_task, "scenario_annual": generate_scenario_annual_task, diff --git a/src/climate_downscale/generate/derived_daily.py b/src/climate_downscale/generate/derived_daily.py index 1a4735b..77e7d52 100644 --- a/src/climate_downscale/generate/derived_daily.py +++ b/src/climate_downscale/generate/derived_daily.py @@ -1,8 +1,8 @@ import itertools import click +from dask.diagnostics.progress import ProgressBar from rra_tools import jobmon -from dask.diagnostics import ProgressBar from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData @@ -35,27 +35,27 @@ def generate_derived_daily_main( target_variable: str, scenario: str, year: str, - progress_bar: bool = False + progress_bar: bool = False, ) -> None: cd_data = ClimateDownscaleData(output_dir) transform = TRANSFORM_MAP[target_variable] - # Empirically tested to find a good balance between + # Empirically tested to find a good balance between # runtime and memory usage for data at this scale. 
chunks = {"latitude": -1, "longitude": -1, "date": 20} ds = transform( *[ - cd_data.load_daily_results(scenario, source_variable, year).chunk(**chunks) + cd_data.load_daily_results(scenario, source_variable, year).chunk(**chunks) # type: ignore[arg-type] for source_variable in transform.source_variables ] ) if progress_bar: - with ProgressBar(): + with ProgressBar(): # type: ignore[no-untyped-call] ds = ds.compute() else: ds = ds.compute() - + cd_data.save_daily_results( ds, scenario=scenario, @@ -99,7 +99,7 @@ def generate_derived_daily( target_variable: str, cmip6_experiment: str, queue: str, - overwrite: bool, # noqa: FBT001 + overwrite: bool, ) -> None: cd_data = ClimateDownscaleData(output_dir) @@ -109,7 +109,7 @@ def generate_derived_daily( else [target_variable] ) experiments = ( - clio.VALID_CMIP6_EXPERIMENTS + ['historical'] + [*clio.VALID_CMIP6_EXPERIMENTS, "historical"] if cmip6_experiment == clio.RUN_ALL else [cmip6_experiment] ) @@ -125,11 +125,11 @@ def generate_derived_daily( if path.exists() and path.stat().st_size == 0: # job failed when writing, delete the file path.unlink() - + if not path.exists() or overwrite: vey.append((v, e, y)) else: - complete.append((v, e, y)) + complete.append((v, e, y)) print(f"{len(complete)} tasks already done. {len(vey)} tasks to do.") if not vey: diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index 83ceab9..1508477 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -3,6 +3,7 @@ import click import xarray as xr +from dask.diagnostics.progress import ProgressBar from rra_tools import jobmon from climate_downscale import cli_options as clio @@ -105,7 +106,11 @@ def generate_scenario_annual_main( - output_dir: str | Path, target_variable: str, scenario: str, year: str, progress_bar: bool = False + output_dir: str | Path, + target_variable: str, + scenario: str, + year: str, + progress_bar: bool = False, ) -> None: cd_data = ClimateDownscaleData(output_dir) transform = TRANSFORM_MAP[target_variable] @@ -118,11 +123,11 @@ def generate_scenario_annual_main( ) if progress_bar: - with ProgressBar(): + with ProgressBar(): # type: ignore[no-untyped-call] ds = ds.compute() else: ds = ds.compute() - + cd_data.save_annual_results( ds, scenario=scenario, @@ -167,7 +172,7 @@ def generate_scenario_annual( target_variable: str, cmip6_experiment: str, queue: str, - overwrite: bool, # noqa: FBT001 + overwrite: bool, ) -> None: cd_data = ClimateDownscaleData(output_dir) @@ -177,7 +182,7 @@ def generate_scenario_annual( else [target_variable] ) experiments = ( - clio.VALID_CMIP6_EXPERIMENTS + ['historical'] + [*clio.VALID_CMIP6_EXPERIMENTS, "historical"] if cmip6_experiment == clio.RUN_ALL else [cmip6_experiment] ) diff --git a/src/climate_downscale/generate/scenario_inclusion.py b/src/climate_downscale/generate/scenario_inclusion.py index 6d416f4..63dc1ea 100644 --- a/src/climate_downscale/generate/scenario_inclusion.py +++ b/src/climate_downscale/generate/scenario_inclusion.py @@ -1,78 +1,76 @@ +import warnings from pathlib import Path +from typing import Any +import click import pandas as pd import xarray as xr from rra_tools import parallel -import tqdm +from climate_downscale import cli_options as clio from climate_downscale.data import DEFAULT_ROOT, ClimateDownscaleData -import warnings - -warnings.filterwarnings('ignore') +warnings.filterwarnings("ignore") -cd_data = ClimateDownscaleData(output_dir) -paths = 
list(cd_data.extracted_cmip6.glob(f'*.nc')) -def extract_metadata(data_path: Path) -> tuple: - variable, scenario, source, variant = data_path.stem.split('_') - - realization = variant.split('i')[0][1:] - initialization = variant.split('i')[1].split('p')[0] - physics = variant.split('p')[1].split('f')[0] - forcing = variant.split('f')[1] - - +def extract_metadata(data_path: Path) -> tuple[Any]: + meta = data_path.stem.split("_") ds = xr.open_dataset(data_path) - year_start = ds['time.year'].min().item() - year_end = ds['time.year'].max().item() - return (variable, scenario, source, variant, realization, initialization, physics, forcing, year_start, year_end) + year_start = ds["time.year"].min().item() + year_end = ds["time.year"].max().item() + return *meta, year_start, year_end, str(data_path) + + +def generate_scenario_inclusion_main( + output_dir: str | Path, *, num_cores: int = 1, progress_bar: bool = False +) -> None: + cd_data = ClimateDownscaleData(output_dir) + paths = list(cd_data.extracted_cmip6.glob("*.nc")) + + meta_list = parallel.run_parallel( + extract_metadata, + paths, + num_cores=num_cores, + progress_bar=progress_bar, + ) -meta_list = parallel.run_parallel( - extract_metadata, - paths, - num_cores=25, - progress_bar=True, -) + columns = ["variable", "scenario", "source", "variant", "year_start", "year_end"] + meta_df = pd.DataFrame(meta_list, columns=columns) + meta_df["all_years"] = (meta_df.year_start <= 2020) & (meta_df.year_end >= 2099) # noqa: PLR2004 + meta_df["year_range"] = meta_df.apply( + lambda r: f"{r.loc['year_start']}_{r.loc['year_end']}", axis=1 + ) -meta_df = ( - pd.DataFrame( - meta_list, - columns=[ - 'variable', - 'scenario', - 'source', - 'variant', - 'realization', - 'initialization', - 'physics', - 'forcing', - 'year_start', - 'year_end', - ], - ).assign( - all_years=lambda x: (x.year_start <= 2020) & (x.year_end >= 2099), - year_range=lambda x: x.apply(lambda r: f"{r.loc['year_start']}_{r.loc['year_end']}", axis=1), + valid_scenarios = ( + meta_df.set_index(["variable", "source", "variant", "scenario"]) + .all_years.unstack() + .fillna(value=False) + .sum(axis=1) + .rename("valid_scenarios") + ) + year_range = ( + meta_df.set_index(["variable", "source", "variant", "scenario"]) + .year_range.unstack() + .fillna("") ) -) + inclusion_df = pd.concat([year_range, valid_scenarios], axis=1).reset_index() + inclusion_df["include"] = inclusion_df.valid_scenarios == 5 # noqa: PLR2004 -valid_scenarios = ( - meta_df - .set_index(['variable', 'source', 'variant', 'scenario']).all_years - .unstack() - .fillna(False) - .sum(axis=1) - .rename('valid_scenarios') -) -year_range = ( - meta_df - .set_index(['variable', 'source', 'variant', 'scenario']).year_range - .unstack() - .fillna("") -) -inclusion_df = pd.concat([ - year_range, - valid_scenarios, - meta_df.drop(columns=['scenario', 'year_start', 'year_end', 'all_years', 'year_range']).drop_duplicates().set_index(['variable', 'source', 'variant']) -], axis=1) -inclusion_df['include'] = inclusion_df.valid_scenarios == 5 \ No newline at end of file + cd_data.save_scenario_metadata(meta_df) + cd_data.save_scenario_inclusion_metadata(inclusion_df) + + +@click.command() # type: ignore[arg-type] +@clio.with_output_directory(DEFAULT_ROOT) +@clio.with_num_cores(default=10) +@clio.with_progress_bar() +def generate_scenario_inclusion( + output_dir: str, + num_cores: int, + progress_bar: bool, +) -> None: + generate_scenario_inclusion_main( + output_dir, + num_cores=num_cores, + progress_bar=progress_bar, + ) From 
ae7b4772b2eecbff6b1d8627144da1a12e58a3d0 Mon Sep 17 00:00:00 2001 From: collijk Date: Mon, 8 Jul 2024 12:19:18 -0700 Subject: [PATCH 70/71] Remove extra path column --- src/climate_downscale/generate/scenario_inclusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/climate_downscale/generate/scenario_inclusion.py b/src/climate_downscale/generate/scenario_inclusion.py index 63dc1ea..bf5a0e7 100644 --- a/src/climate_downscale/generate/scenario_inclusion.py +++ b/src/climate_downscale/generate/scenario_inclusion.py @@ -13,12 +13,12 @@ warnings.filterwarnings("ignore") -def extract_metadata(data_path: Path) -> tuple[Any]: +def extract_metadata(data_path: Path) -> tuple[Any, ...]: meta = data_path.stem.split("_") ds = xr.open_dataset(data_path) year_start = ds["time.year"].min().item() year_end = ds["time.year"].max().item() - return *meta, year_start, year_end, str(data_path) + return *meta, year_start, year_end def generate_scenario_inclusion_main( From cef92b3c4e21d45e85cc43cf989cc0158f42b5d9 Mon Sep 17 00:00:00 2001 From: James Collins Date: Tue, 9 Jul 2024 05:40:51 -0700 Subject: [PATCH 71/71] So many fixes --- src/climate_downscale/data.py | 2 +- .../generate/derived_daily.py | 3 - .../generate/historical_daily.py | 7 +- .../generate/historical_reference.py | 2 +- .../generate/scenario_annual.py | 6 - .../generate/scenario_daily.py | 139 ++++++++++-------- .../generate/scenario_inclusion.py | 6 + 7 files changed, 84 insertions(+), 81 deletions(-) diff --git a/src/climate_downscale/data.py b/src/climate_downscale/data.py index 71fa346..77be728 100644 --- a/src/climate_downscale/data.py +++ b/src/climate_downscale/data.py @@ -37,7 +37,7 @@ def extracted_era5(self) -> Path: def extracted_era5_path( self, dataset: str, variable: str, year: int | str, month: str ) -> Path: - return self.extracted_era5 / f"{dataset}_{variable}_{year}_{month}.nc" + return self.extracted_era5 / f"reanalysis-era5-{dataset}_{variable}_{year}_{month}.nc" @property def extracted_cmip6(self) -> Path: diff --git a/src/climate_downscale/generate/derived_daily.py b/src/climate_downscale/generate/derived_daily.py index 77e7d52..50c5309 100644 --- a/src/climate_downscale/generate/derived_daily.py +++ b/src/climate_downscale/generate/derived_daily.py @@ -13,19 +13,16 @@ source_variables=["mean_temperature", "relative_humidity"], transform_funcs=[utils.heat_index], encoding_scale=0.01, - encoding_offset=273.15, ), "humidex": utils.Transform( source_variables=["mean_temperature", "relative_humidity"], transform_funcs=[utils.humidex], encoding_scale=0.01, - encoding_offset=273.15, ), "effective_temperature": utils.Transform( source_variables=["mean_temperature", "relative_humidity", "wind_speed"], transform_funcs=[utils.effective_temperature], encoding_scale=0.01, - encoding_offset=273.15, ), } diff --git a/src/climate_downscale/generate/historical_daily.py b/src/climate_downscale/generate/historical_daily.py index 5eeddde..6585318 100644 --- a/src/climate_downscale/generate/historical_daily.py +++ b/src/climate_downscale/generate/historical_daily.py @@ -35,19 +35,16 @@ source_variables=["2m_temperature"], transform_funcs=[utils.daily_mean], encoding_scale=0.01, - encoding_offset=273.15, ), "max_temperature": utils.Transform( source_variables=["2m_temperature"], transform_funcs=[utils.daily_max], encoding_scale=0.01, - encoding_offset=273.15, ), "min_temperature": utils.Transform( source_variables=["2m_temperature"], transform_funcs=[utils.daily_min], encoding_scale=0.01, - encoding_offset=273.15, 
), "wind_speed": utils.Transform( source_variables=["10m_u_component_of_wind", "10m_v_component_of_wind"], @@ -98,8 +95,8 @@ def load_variable( ds = load_and_shift_longitude(path) # There are some slight numerical differences in the lat/long for some of # the land datasets. They are gridded consistently, so just tweak the - # coordinates so things align. - ds = ds.assign_coords(latitude=utils.TARGET_LAT, longitude=utils.TARGET_LON) + # coordinates so things align. + ds = ds.assign_coords(latitude=utils.TARGET_LAT[::-1], longitude=utils.TARGET_LON) else: ds = load_and_shift_longitude(path) conversion = CONVERT_MAP[variable] diff --git a/src/climate_downscale/generate/historical_reference.py b/src/climate_downscale/generate/historical_reference.py index de030a2..14f84ff 100644 --- a/src/climate_downscale/generate/historical_reference.py +++ b/src/climate_downscale/generate/historical_reference.py @@ -63,7 +63,7 @@ def generate_historical_reference_task( @click.command() # type: ignore[arg-type] @clio.with_output_directory(DEFAULT_ROOT) -@clio.with_target_variable(variable_names=list(TRANSFORM_MAP)) +@clio.with_target_variable(allow_all=True, variable_names=list(TRANSFORM_MAP)) @clio.with_queue() def generate_historical_reference( output_dir: str, diff --git a/src/climate_downscale/generate/scenario_annual.py b/src/climate_downscale/generate/scenario_annual.py index 1508477..455614e 100644 --- a/src/climate_downscale/generate/scenario_annual.py +++ b/src/climate_downscale/generate/scenario_annual.py @@ -18,19 +18,16 @@ source_variables=["mean_temperature"], transform_funcs=[utils.annual_mean], encoding_scale=0.01, - encoding_offset=273.15, ), "mean_high_temperature": utils.Transform( source_variables=["max_temperature"], transform_funcs=[utils.annual_mean], encoding_scale=0.01, - encoding_offset=273.15, ), "mean_low_temperature": utils.Transform( source_variables=["min_temperature"], transform_funcs=[utils.annual_mean], encoding_scale=0.01, - encoding_offset=273.15, ), **{ f"days_over_{temp}C": utils.Transform( @@ -43,7 +40,6 @@ source_variables=["heat_index"], transform_funcs=[utils.annual_mean], encoding_scale=0.01, - encoding_offset=273.15, ), **{ f"days_over_{temp}C_heat_index": utils.Transform( @@ -59,7 +55,6 @@ source_variables=["humidex"], transform_funcs=[utils.annual_mean], encoding_scale=0.01, - encoding_offset=273.15, ), **{ f"days_over_{temp}C_humidex": utils.Transform( @@ -75,7 +70,6 @@ source_variables=["effective_temperature"], transform_funcs=[utils.annual_mean], encoding_scale=0.01, - encoding_offset=273.15, ), **{ f"days_over_{temp}C_effective_temperature": utils.Transform( diff --git a/src/climate_downscale/generate/scenario_daily.py b/src/climate_downscale/generate/scenario_daily.py index 12cd391..0cf48a7 100644 --- a/src/climate_downscale/generate/scenario_daily.py +++ b/src/climate_downscale/generate/scenario_daily.py @@ -1,3 +1,4 @@ +from collections import defaultdict import itertools from pathlib import Path @@ -33,7 +34,6 @@ source_variables=["tas"], transform_funcs=[utils.identity], encoding_scale=0.01, - encoding_offset=273.15, ), "additive", ), @@ -42,7 +42,6 @@ source_variables=["tasmax"], transform_funcs=[utils.identity], encoding_scale=0.01, - encoding_offset=273.15, ), "additive", ), @@ -51,7 +50,6 @@ source_variables=["tasmin"], transform_funcs=[utils.identity], encoding_scale=0.01, - encoding_offset=273.15, ), "additive", ), @@ -86,29 +84,16 @@ def get_source_paths( cd_data: ClimateDownscaleData, source_variables: list[str], cmip6_experiment: str, -) -> 
list[list[Path]]: - models_by_var = {} - for source_variable in source_variables: - model_vars = { - p.stem.split(f"{cmip6_experiment}_")[1] - for p in cd_data.extracted_cmip6.glob( - f"{source_variable}_{cmip6_experiment}*.nc" - ) - } - models_by_var[source_variable] = model_vars - - shared_models = set.intersection(*models_by_var.values()) - for var, models in models_by_var.items(): - extra_models = models.difference(shared_models) - if extra_models: - print(var, extra_models) - source_paths = [ - [ - cd_data.extracted_cmip6 / f"{source_variable}_{cmip6_experiment}_{model}.nc" - for source_variable in source_variables - ] - for model in sorted(shared_models) - ] +) -> dict[str, list[list[Path]]]: + inclusion_meta = cd_data.load_scenario_inclusion_metadata()[source_variables] + inclusion_meta = inclusion_meta[inclusion_meta.all(axis=1)] + source_paths = defaultdict(list) + for source, variant in inclusion_meta.index.tolist(): + source_paths[source].append( + [cd_data.extracted_cmip6_path(v, cmip6_experiment, source, variant) + for v in source_variables] + ) + return source_paths @@ -187,52 +172,76 @@ def generate_scenario_daily_main( # noqa: PLR0912 year="reference", ) - anomalies: dict[str, xr.Dataset] = {} - for i, sps in enumerate(source_paths): - pid = f"{i+1}/{len(source_paths)} {sps[0].stem}" - print(f"{pid}: Loading reference") - try: - scenario_reference = transform( - *[load_variable(sp, "reference") for sp in sps] - ) - print(f"{pid}: Loading target") - target = transform(*[load_variable(sp, year) for sp in sps]) - except KeyError: - print(f"{pid}: Bad formatting, skipping...") - continue - print(f"{pid}: computing anomaly") - s_anomaly = compute_anomaly(scenario_reference, target, anomaly_type) - key = f"{len(s_anomaly.latitude)}_{len(s_anomaly.longitude)}" + anomalies: dict[str, dict[str, tuple[int, xr.Dataset]]] = {} + for i, (source, variant_paths) in enumerate(source_paths.items()): + sid = f"Source {i+1}/{len(source_paths)}: {source}" - if key in anomalies: - old = anomalies[key] - for coord in ["latitude", "longitude"]: - old_c = old[coord].to_numpy() - new_c = s_anomaly[coord].to_numpy() - tol = 1e-5 - if np.abs(old_c - new_c).max() < tol: - s_anomaly = s_anomaly.assign({coord: old_c}) - else: - msg = f"{coord} does not match despite having the same subdivision" - raise ValueError(msg) - anomalies[key] = old + s_anomaly - else: - anomalies[key] = s_anomaly + source_anomalies: dict[str, tuple[int, xr.Dataset]] = {} + for j, vps in enumerate(variant_paths): + vid = f"{sid}, Variant {j+1}/{len(variant_paths)}: {vps[0].stem.split('_')[-1]}" + try: + print(f"{vid}: Loading reference") + sref = transform(*[load_variable(vp, "reference") for vp in vps]) + print(f"{vid}: Loading target") + target = transform(*[load_variable(vp, year) for vp in vps]) + except KeyError: + print(f"{vid}: Bad formatting, skipping...") + continue + + print(f"{vid}: computing anomaly") + v_anomaly = compute_anomaly(sref, target, anomaly_type) + + key = f"{len(v_anomaly.latitude)}_{len(v_anomaly.longitude)}" + + if key in source_anomalies: + old_count, old_anomaly = source_anomalies[key] + + for coord in ["latitude", "longitude"]: + old_c = old_anomaly[coord].to_numpy() + new_c = v_anomaly[coord].to_numpy() + tol = 1e-5 + + if np.abs(old_c - new_c).max() < tol: + v_anomaly = v_anomaly.assign({coord: old_c}) + else: + msg = f"{coord} does not match despite having the same subdivision" + raise ValueError(msg) + source_anomalies[key] = old_count + 1, old_anomaly + v_anomaly + else: + source_anomalies[key] 
= 1, v_anomaly + if source_anomalies: + anomalies[source] = source_anomalies + + ensemble_anomaly = xr.Dataset() + for i, (source, source_anomalies) in enumerate(anomalies.items()): + sid = f"Source {i+1}/{len(source_paths)}: {source}" + print(f"Downscaling {i+1}/{len(anomalies)}: {source}") + + source_ensemble_anomaly = xr.Dataset() + total_count = 0 + for j, (res, (count, v_anomaly)) in enumerate(source_anomalies.items()): + res_id = f"{sid}, Resolution {j} / {len(source_anomalies)}: {res}" + print(f"Downscaling {res_id}") + + if source_ensemble_anomaly.nbytes: + source_ensemble_anomaly += utils.interpolate_to_target_latlon(v_anomaly, method="linear") + else: + source_ensemble_anomaly = utils.interpolate_to_target_latlon(v_anomaly, method="linear") + total_count += count + source_ensemble_anomaly /= total_count - anomaly = xr.Dataset() - for i, (k, v) in enumerate(anomalies.items()): - print(f"Downscaling {i+1}/{len(anomalies)}: {k}") - if anomaly.nbytes: - anomaly += utils.interpolate_to_target_latlon(v, method="linear") + if ensemble_anomaly.nbytes: + ensemble_anomaly += source_ensemble_anomaly else: - anomaly = utils.interpolate_to_target_latlon(v, method="linear") - anomaly /= len(source_paths) + ensemble_anomaly = source_ensemble_anomaly + + ensemble_anomaly /= len(anomalies) print("Computing scenario data") if anomaly_type == "additive": - scenario_data = historical_reference + anomaly.groupby("date.month") + scenario_data = historical_reference + ensemble_anomaly.groupby("date.month") else: - scenario_data = historical_reference * anomaly.groupby("date.month") + scenario_data = historical_reference * ensemble_anomaly.groupby("date.month") scenario_data = scenario_data.drop_vars("month") print("Saving") cd_data.save_daily_results( diff --git a/src/climate_downscale/generate/scenario_inclusion.py b/src/climate_downscale/generate/scenario_inclusion.py index bf5a0e7..a04a277 100644 --- a/src/climate_downscale/generate/scenario_inclusion.py +++ b/src/climate_downscale/generate/scenario_inclusion.py @@ -55,6 +55,12 @@ def generate_scenario_inclusion_main( ) inclusion_df = pd.concat([year_range, valid_scenarios], axis=1).reset_index() inclusion_df["include"] = inclusion_df.valid_scenarios == 5 # noqa: PLR2004 + inclusion_df = ( + inclusion_df.loc[inclusion_df.include] + .set_index(['source', 'variant', 'variable']).include + .unstack() + .fillna(False) + ) cd_data.save_scenario_metadata(meta_df) cd_data.save_scenario_inclusion_metadata(inclusion_df)
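As a closing illustration of the inclusion rule the last two patches converge on: a (source, variant) pair is kept for a variable only when every experiment in the valid set has a file with full temporal coverage, i.e. spanning at least 2020–2099. Below is a toy pandas example with made-up model and experiment names; the real code additionally pivots the result into a wide (source, variant) × variable table and persists it via `save_scenario_inclusion_metadata`.

```python
import pandas as pd

# One row per extracted CMIP6 file; names are illustrative only.
meta = pd.DataFrame(
    [
        ("tas", "ModelA", "r1i1p1f1", "ssp119", 2015, 2100),
        ("tas", "ModelA", "r1i1p1f1", "ssp126", 2015, 2100),
        ("tas", "ModelA", "r1i1p1f1", "ssp245", 2015, 2100),
        ("tas", "ModelA", "r1i1p1f1", "ssp370", 2015, 2100),
        ("tas", "ModelA", "r1i1p1f1", "ssp585", 2015, 2100),
        ("tas", "ModelB", "r1i1p1f1", "ssp126", 2015, 2100),
        ("tas", "ModelB", "r1i1p1f1", "ssp585", 2015, 2055),  # truncated run
    ],
    columns=["variable", "source", "variant", "scenario", "year_start", "year_end"],
)
meta["all_years"] = (meta.year_start <= 2020) & (meta.year_end >= 2099)

n_required = meta.scenario.nunique()  # 5 experiments in this toy set
valid_scenarios = (
    meta.set_index(["variable", "source", "variant", "scenario"])
    .all_years.unstack()
    .fillna(value=False)
    .sum(axis=1)
)
include = (valid_scenarios == n_required).rename("include")
print(include)
# (tas, ModelA, r1i1p1f1) -> True; (tas, ModelB, r1i1p1f1) -> False,
# because ModelB is missing experiments and its ssp585 run is truncated.
```

Requiring the count to hit an exact total (the hard-coded `== 5` in the patch) guarantees that every ensemble member contributes to all experiments, rather than a shifting mix of models per scenario.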