From 5de498e75e53193a3715b046dc654d140e27a4e8 Mon Sep 17 00:00:00 2001
From: Zane Selvans
Date: Fri, 18 Aug 2023 19:42:41 -0600
Subject: [PATCH 1/7] Allow a mix of Zenodo sandbox & production DOIs

Okay, I did this off the clock since it has been driving me a little bit
nuts. Historically we've required that all Zenodo DOIs in the datastore
come either from the Sandbox or the Production server, which makes testing
a single new archive on its own a hassle, and adds complexity across the
whole application with switches for sandbox vs. not-sandbox data sources.
This commit removes that requirement and allows a mix of sandbox and
production DOIs to be used in development.

I also removed some very sparse documentation about how to create an
archive in the Datastore by hand. It was very old, probably no longer
supported, and certainly not being tested, so it seemed likely to confuse
and frustrate anyone who actually tried to follow it.

There's a unit test which checks that all DOIs are production rather than
sandbox DOIs, to make it difficult to accidentally check in code that
refers to unofficial input data.
---
 README.rst                                  |   2 +-
 docs/dev/datastore.rst                      |  39 ++----
 docs/dev/testing.rst                        |   1 -
 src/pudl/cli/etl.py                         |   7 -
 src/pudl/ferc_to_sqlite/cli.py              |   7 -
 src/pudl/metadata/classes.py                |   1 -
 src/pudl/resources.py                       |   6 -
 src/pudl/workspace/datastore.py             | 137 +++++++++-----------
 test/conftest.py                            |   7 -
 test/integration/zenodo_datapackage_test.py |  21 ++-
 test/unit/workspace/datastore_test.py       |  35 +++--
 11 files changed, 99 insertions(+), 164 deletions(-)

diff --git a/README.rst b/README.rst
index f328e7a7f9..aa3a4f8a42 100644
--- a/README.rst
+++ b/README.rst
@@ -124,7 +124,7 @@ Want access to all the published data in bulk? If you're familiar with Python
 and `Jupyter Notebooks `__ and are willing to install
 Docker you can:
 
-* `Download a PUDL data release `__ from
+* `Download a PUDL data release `__ from
   CERN's `Zenodo `__ archiving service.
 * `Install Docker `__
 * Run the archived image using ``docker-compose up``
diff --git a/docs/dev/datastore.rst b/docs/dev/datastore.rst
index e9411537f6..eb3f2f452e 100644
--- a/docs/dev/datastore.rst
+++ b/docs/dev/datastore.rst
@@ -38,15 +38,17 @@ For more detailed usage information, see:
 
     $ pudl_datastore --help
 
 The downloaded data will be used by the script to populate a datastore under
-the ``data`` directory in your workspace, organized by data source, form, and
-date::
+the your ``$PUDL_INPUT`` directory, organized by data source, form, and DOI::
 
     data/censusdp1tract/
     data/eia860/
+    data/eia860m/
     data/eia861/
     data/eia923/
     data/epacems/
     data/ferc1/
+    data/ferc2/
+    data/ferc60/
     data/ferc714/
 
 If the download fails to complete successfully, the script can be run repeatedly until
@@ -64,28 +66,13 @@ archival and versioning of datasets. See the `documentation
 `__
 for information on adding datasets to the datastore.
 
-Prepare the Datastore
-^^^^^^^^^^^^^^^^^^^^^
+Tell PUDL about the archive
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-If you have used pudl-archiver to prepare a Zenodo archive as above, you
-can add support for your archive to the datastore by adding the DOI to
-pudl.workspace.datastore.DOI, under "sandbox" or "production" as appropriate.
-
-If you want to prepare an archive for the datastore separately, the following
-are required.
-
-#. The root path must contain a ``datapackage.json`` file that conforms to the
-   `frictionless datapackage spec `__
-#. 
Each listed resource among the ``datapackage.json`` resources must include:
-
-   * ``path`` containing the zenodo download url for the specific file.
-   * ``remote_url`` with the same url as the ``path``
-   * ``name`` of the file
-   * ``hash`` with the md5 hash of the file
-   * ``parts`` a set of key / value pairs defining additional attributes that
-     can be used to select a subset of the whole datapackage. For example, the
-     ``epacems`` dataset is partitioned by year and state, and
-     ``"parts": {"year": 2010, "state": "ca"}`` would indicate that the
-     resource contains data for the state of California in the year 2010.
-     Unpartitioned datasets like the ``ferc714`` which includes all years in
-     a single file, would have an empty ``"parts": {}``
+Once you have used pudl-archiver to prepare a Zenodo archive as above, you
+can make the PUDL Datastore aware of it by updating the appropriate DOI in
+:class:`pudl.workspace.datastore.ZenodoFetcher`. DOIs can refer to resources from the
+`Zenodo sandbox server `__ for archives that are still in
+testing or development (sandbox DOIs have a prefix of ``10.5072``), or the
+`Zenodo production server `__ if the archive is ready for
+public use (production DOIs have a prefix of ``10.5281``).
diff --git a/docs/dev/testing.rst b/docs/dev/testing.rst
index 79439e468f..679074661f 100644
--- a/docs/dev/testing.rst
+++ b/docs/dev/testing.rst
@@ -304,7 +304,6 @@ You can always check to see what custom flags exist by running
     Path to a non-standard ETL settings file to use.
   --gcs-cache-path=GCS_CACHE_PATH
     If set, use this GCS path as a datastore cache layer.
-  --sandbox           Use raw inputs from the Zenodo sandbox server.
 
 The main flexibility that these custom options provide is in selecting
 where the raw input data comes from and what data the tests should be run
diff --git a/src/pudl/cli/etl.py b/src/pudl/cli/etl.py
index 39707ae775..604c754ee8 100644
--- a/src/pudl/cli/etl.py
+++ b/src/pudl/cli/etl.py
@@ -45,12 +45,6 @@ def parse_command_line(argv):
     parser.add_argument(
         dest="settings_file", type=str, default="", help="path to ETL settings file."
) - parser.add_argument( - "--sandbox", - action="store_true", - default=False, - help="Use the Zenodo sandbox rather than production", - ) parser.add_argument( "--logfile", default=None, @@ -156,7 +150,6 @@ def main(): "dataset_settings": {"config": dataset_settings_config}, "datastore": { "config": { - "sandbox": args.sandbox, "gcs_cache_path": args.gcs_cache_path if args.gcs_cache_path else "", diff --git a/src/pudl/ferc_to_sqlite/cli.py b/src/pudl/ferc_to_sqlite/cli.py index f7cbf9af5e..7a18f613cf 100755 --- a/src/pudl/ferc_to_sqlite/cli.py +++ b/src/pudl/ferc_to_sqlite/cli.py @@ -52,12 +52,6 @@ def parse_command_line(argv): fail.""", default=False, ) - parser.add_argument( - "--sandbox", - action="store_true", - default=False, - help="Use the Zenodo sandbox rather than production", - ) parser.add_argument( "-b", "--batch-size", @@ -155,7 +149,6 @@ def main(): # noqa: C901 }, "datastore": { "config": { - "sandbox": args.sandbox, "gcs_cache_path": args.gcs_cache_path if args.gcs_cache_path else "", diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index 4404065763..43467d72ec 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -949,7 +949,6 @@ def get_temporal_coverage(self, partitions: dict = None) -> str: def add_datastore_metadata(self) -> None: """Get source file metadata from the datastore.""" dp_desc = Datastore( - sandbox=False, local_cache_path=PudlPaths().data_dir, gcs_cache_path="gs://zenodo-cache.catalyst.coop", ).get_datapackage_descriptor(self.name) diff --git a/src/pudl/resources.py b/src/pudl/resources.py index 476e84fa54..13d2a50471 100644 --- a/src/pudl/resources.py +++ b/src/pudl/resources.py @@ -39,18 +39,12 @@ def ferc_to_sqlite_settings(init_context) -> FercToSqliteSettings: description="If enabled, the local file cache for datastore will be used.", default_value=True, ), - "sandbox": Field( - bool, - description="Use the Zenodo sandbox rather than production", - default_value=False, - ), }, ) def datastore(init_context) -> Datastore: """Dagster resource to interact with Zenodo archives.""" ds_kwargs = {} ds_kwargs["gcs_cache_path"] = init_context.resource_config["gcs_cache_path"] - ds_kwargs["sandbox"] = init_context.resource_config["sandbox"] if init_context.resource_config["use_local_cache"]: # TODO(rousik): we could also just use PudlPaths().input_dir here, because diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index d0a5c2191d..78519aeffb 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -44,9 +44,9 @@ def __init__(self, datapackage_json: dict, dataset: str, doi: str): """Constructs DatapackageDescriptor. Args: - datapackage_json (dict): parsed datapackage.json describing this datapackage. - dataset (str): name of the dataset. - doi (str): DOI (aka version) of the dataset. + datapackage_json: parsed datapackage.json describing this datapackage. + dataset: The name (an identifying string) of the dataset. + doi: A versioned Digital Object Identifier for the dataset. 
""" self.datapackage_json = datapackage_json self.dataset = dataset @@ -167,54 +167,46 @@ class ZenodoFetcher: } DOI = { - "sandbox": { - "censusdp1tract": "10.5072/zenodo.674992", - "eia860": "10.5072/zenodo.1222854", - "eia860m": "10.5072/zenodo.1225517", - "eia861": "10.5072/zenodo.1229930", - "eia923": "10.5072/zenodo.1217724", - "eia_bulk_elec": "10.5072/zenodo.1103572", - "epacamd_eia": "10.5072/zenodo.1199170", - "epacems": "10.5072/zenodo.672963", - "ferc1": "10.5072/zenodo.1070868", - "ferc2": "10.5072/zenodo.1188447", - "ferc6": "10.5072/zenodo.1098088", - "ferc60": "10.5072/zenodo.1098089", - "ferc714": "10.5072/zenodo.1098302", - }, - "production": { - "censusdp1tract": "10.5281/zenodo.4127049", - "eia860": "10.5281/zenodo.8164776", - "eia860m": "10.5281/zenodo.8188017", - "eia861": "10.5281/zenodo.8231268", - "eia923": "10.5281/zenodo.8172818", - "eia_bulk_elec": "10.5281/zenodo.7067367", - "epacamd_eia": "10.5281/zenodo.7900974", - "epacems": "10.5281/zenodo.6910058", - "ferc1": "10.5281/zenodo.7314437", - "ferc2": "10.5281/zenodo.8006881", - "ferc6": "10.5281/zenodo.7130141", - "ferc60": "10.5281/zenodo.7130146", - "ferc714": "10.5281/zenodo.7139875", - }, + # Sandbox DOIs are provided for reference + "censusdp1tract": "10.5281/zenodo.4127049", + # "censusdp1tract": "10.5072/zenodo.674992", + "eia860": "10.5281/zenodo.8164776", + # "eia860": "10.5072/zenodo.1222854", + "eia860m": "10.5281/zenodo.8188017", + # "eia860m": "10.5072/zenodo.1225517", + "eia861": "10.5281/zenodo.8231268", + # "eia861": "10.5072/zenodo.1229930", + "eia923": "10.5281/zenodo.8172818", + # "eia923": "10.5072/zenodo.1217724", + "eia_bulk_elec": "10.5281/zenodo.7067367", + # "eia_bulk_elec": "10.5072/zenodo.1103572", + "epacamd_eia": "10.5281/zenodo.7900974", + # "epacamd_eia": "10.5072/zenodo.1199170", + "epacems": "10.5281/zenodo.6910058", + # "epacems": "10.5072/zenodo.672963", + "ferc1": "10.5281/zenodo.7314437", + # "ferc1": "10.5072/zenodo.1070868", + "ferc2": "10.5281/zenodo.8006881", + # "ferc2": "10.5072/zenodo.1188447", + "ferc6": "10.5281/zenodo.7130141", + # "ferc6": "10.5072/zenodo.1098088", + "ferc60": "10.5281/zenodo.7130146", + # "ferc60": "10.5072/zenodo.1098089", + "ferc714": "10.5281/zenodo.7139875", + # "ferc714": "10.5072/zenodo.1098302", } API_ROOT = { "sandbox": "https://sandbox.zenodo.org/api", "production": "https://zenodo.org/api", } - def __init__(self, sandbox: bool = False, timeout: float = 15.0): + def __init__(self, timeout: float = 15.0): """Constructs ZenodoFetcher instance. Args: - sandbox (bool): controls whether production or sandbox zenodo backends - and associated DOIs should be used. timeout (float): timeout (in seconds) for http requests. 
""" - backend = "sandbox" if sandbox else "production" - self._api_root = self.API_ROOT[backend] - self._token = self.TOKEN[backend] - self._dataset_to_doi = self.DOI[backend] + self._dataset_to_doi = self.DOI self._descriptor_cache: dict[str, DatapackageDescriptor] = {} self.timeout = timeout @@ -229,8 +221,12 @@ def __init__(self, sandbox: bool = False, timeout: float = 15.0): def _fetch_from_url(self, url: str) -> requests.Response: logger.info(f"Retrieving {url} from zenodo") + if "sandbox" in url: + token = self.TOKEN["sandbox"] + else: + token = self.TOKEN["production"] response = self.http.get( - url, params={"access_token": self._token}, timeout=self.timeout + url, params={"access_token": token}, timeout=self.timeout ) if response.status_code == requests.codes.ok: logger.debug(f"Successfully downloaded {url}") @@ -240,16 +236,24 @@ def _fetch_from_url(self, url: str) -> requests.Response: def _doi_to_url(self, doi: str) -> str: """Returns url that holds the datapackage for given doi.""" - match = re.search(r"zenodo.([\d]+)", doi) - if match is None: - raise ValueError(f"Invalid doi {doi}") + match = re.search(r"(10\.5072|10\.5281)/zenodo.([\d]+)", doi) - zen_id = int(match.groups()[0]) - return f"{self._api_root}/deposit/depositions/{zen_id}" + if match is None: + raise ValueError(f"Invalid Zenodo DOI: {doi}") + + doi_prefix = match.groups()[0] + zenodo_id = match.groups()[1] + if doi_prefix == "10.5072": + api_root = self.API_ROOT["sandbox"] + elif doi_prefix == "10.5281": + api_root = self.API_ROOT["production"] + else: + raise ValueError(f"Invalid Zenodo DOI: {doi}") + return f"{api_root}/deposit/depositions/{zenodo_id}" def get_descriptor(self, dataset: str) -> DatapackageDescriptor: - """Returns DatapackageDescriptor for given dataset.""" - doi = self._dataset_to_doi.get(dataset) + """Returns class:`DatapackageDescriptor` for given dataset.""" + doi = self._dataset_to_doi.get(dataset, False) if not doi: raise KeyError(f"No doi found for dataset {dataset}") if doi not in self._descriptor_cache: @@ -295,22 +299,18 @@ def __init__( self, local_cache_path: Path | None = None, gcs_cache_path: str | None = None, - sandbox: bool = False, - timeout: float = 15, + timeout: float = 15.0, ): # TODO(rousik): figure out an efficient way to configure datastore caching """Datastore manages file retrieval for PUDL datasets. Args: - local_cache_path (Path): if provided, LocalFileCache pointed at the data + local_cache_path: if provided, LocalFileCache pointed at the data subdirectory of this path will be used with this Datastore. - gcs_cache_path (str): if provided, GoogleCloudStorageCache will be used + gcs_cache_path: if provided, GoogleCloudStorageCache will be used to retrieve data files. The path is expected to have the following format: gs://bucket[/path_prefix] - sandbox (bool): if True, use sandbox zenodo backend when retrieving files, - otherwise use production. This affects which zenodo servers are contacted - as well as dois used for each dataset. - timeout (floaTR): connection timeouts (in seconds) to use when connecting + timeout: connection timeouts (in seconds) to use when connecting to Zenodo servers. 
""" self._cache = resource_cache.LayeredCache() @@ -332,7 +332,7 @@ def __init__( ) pass - self._zenodo_fetcher = ZenodoFetcher(sandbox=sandbox, timeout=timeout) + self._zenodo_fetcher = ZenodoFetcher(timeout=timeout) def get_known_datasets(self) -> list[str]: """Returns list of supported datasets.""" @@ -442,17 +442,11 @@ def __call__(self, parser, namespace, values, option_string=None): def parse_command_line(): """Collect the command line arguments.""" - prod_dois = "\n".join( - [f" - {x}" for x in ZenodoFetcher.DOI["production"].keys()] - ) - sand_dois = "\n".join([f" - {x}" for x in ZenodoFetcher.DOI["sandbox"].keys()]) + dois = "\n".join([f" - {x}" for x in ZenodoFetcher.DOI]) dataset_msg = f""" Available Production Datasets: -{prod_dois} - -Available Sandbox Datasets: -{sand_dois}""" +{dois}""" parser = argparse.ArgumentParser( description="Download and cache ETL source data from Zenodo.", @@ -463,12 +457,12 @@ def parse_command_line(): parser.add_argument( "--dataset", help="Download the specified dataset only. See below for available options. " - "The default is to download all, which may take an hour or more." - "speed.", + "The default is to download all datasets, which may take hours depending on " + "network speed.", ) parser.add_argument( "--pudl_in", - help="Override pudl_in directory, defaults to setting in ~/.pudl.yml", + help="Input directory to use, overridng the $PUDL_INPUT environment variable.", ) parser.add_argument( "--validate", @@ -476,12 +470,6 @@ def parse_command_line(): action="store_true", default=False, ) - parser.add_argument( - "--sandbox", - help="Download data from Zenodo sandbox server. For testing purposes only.", - action="store_true", - default=False, - ) parser.add_argument( "--loglevel", help="Set logging level (DEBUG, INFO, WARNING, ERROR, or CRITICAL).", @@ -602,7 +590,6 @@ def main(): dstore = Datastore( gcs_cache_path=args.gcs_cache_path, - sandbox=args.sandbox, local_cache_path=cache_path, ) diff --git a/test/conftest.py b/test/conftest.py index 01c29705a4..3e93a98489 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -67,12 +67,6 @@ def pytest_addoption(parser): default=False, help="If enabled, the local file cache for datastore will not be used.", ) - parser.addoption( - "--sandbox", - action="store_true", - default=False, - help="Use raw inputs from the Zenodo sandbox server.", - ) parser.addoption( "--save-unmapped-ids", action="store_true", @@ -416,7 +410,6 @@ def pudl_datastore_config(request): return { "gcs_cache_path": gcs_cache_path if gcs_cache_path else "", "use_local_cache": not request.config.getoption("--bypass-local-cache"), - "sandbox": request.config.getoption("--sandbox"), } diff --git a/test/integration/zenodo_datapackage_test.py b/test/integration/zenodo_datapackage_test.py index 9cbb7d6ba2..a0a02593f7 100644 --- a/test/integration/zenodo_datapackage_test.py +++ b/test/integration/zenodo_datapackage_test.py @@ -8,24 +8,19 @@ class TestZenodoDatapackages: - """Ensure production & sandbox Datastores point to valid datapackages.""" + """Ensure all DOIs in Datastore point to valid datapackages.""" @pytest.mark.xfail( - raises=(MaxRetryError, ConnectionError, RetryError, ResponseError) - ) - def test_sandbox_datapackages(self): - """All datasets point to valid descriptors with 1 or more resources.""" - ds = Datastore(sandbox=True) - for dataset in ds.get_known_datasets(): - desc = ds.get_datapackage_descriptor(dataset) - assert list(desc.get_resources()) - - @pytest.mark.xfail( - raises=(MaxRetryError, ConnectionError, 
RetryError, ResponseError) + raises=( + MaxRetryError, + ConnectionError, + RetryError, + ResponseError, + ) ) def test_prod_datapackages(self): """All datasets point to valid descriptors with 1 or more resources.""" - ds = Datastore(sandbox=False) + ds = Datastore() for dataset in ds.get_known_datasets(): desc = ds.get_datapackage_descriptor(dataset) assert list(desc.get_resources()) diff --git a/test/unit/workspace/datastore_test.py b/test/unit/workspace/datastore_test.py index 38672b44de..323f21003e 100644 --- a/test/unit/workspace/datastore_test.py +++ b/test/unit/workspace/datastore_test.py @@ -235,35 +235,30 @@ def setUp(self): } ) - def test_sandbox_doi_format_is_correct(self): - """Verifies that sandbox ZenodoFetcher DOIs have the right format.""" - ds = datastore.ZenodoFetcher(sandbox=True) - self.assertTrue(ds.get_known_datasets()) - for dataset in ds.get_known_datasets(): - print(f"doi for {dataset} is {ds.get_doi(dataset)}") - self.assertTrue( - re.fullmatch( - r"10\.5072/zenodo\.[0-9]{5,10}", ds.get_doi(dataset) - ), # noqa: FS003 - msg=f"doi for {dataset} is {ds.get_doi(dataset)}", - ) + def test_doi_format_is_correct(self): + """Verifies ZenodoFetcher DOIs have correct format and are not sandbox DOIs. - def test_prod_doi_format_is_correct(self): - """Verifies that production ZenodoFetcher DOIs have the right format.""" - ds = datastore.ZenodoFetcher(sandbox=False) + Sandbox DOIs are only meant for use in testing and development, and should not + be checked in, thus this test will fail if a sandbox DOI with prefix 10.5072 is + identified. + """ + ds = datastore.ZenodoFetcher() self.assertTrue(ds.get_known_datasets()) for dataset in ds.get_known_datasets(): + doi = ds.get_doi(dataset) + self.assertFalse( + re.fullmatch(r"10\.5072/zenodo\.[0-9]{5,10}", doi), + msg=f"Zenodo sandbox DOI found for {dataset}: {doi}", + ) self.assertTrue( - re.fullmatch( - r"10\.5281/zenodo\.[0-9]{5,10}", ds.get_doi(dataset) - ), # noqa: FS003 - msg=f"doi for {dataset} is {ds.get_doi(dataset)}", + re.fullmatch(r"10\.5281/zenodo\.[0-9]{5,10}", doi), + msg=f"Zenodo production DOI for {dataset} is {doi}", ) def test_get_known_datasets(self): """Call to get_known_datasets() produces the expected results.""" self.assertEqual( - sorted(datastore.ZenodoFetcher.DOI["production"]), + sorted(datastore.ZenodoFetcher.DOI), self.fetcher.get_known_datasets(), ) From 0c3c0502bbe4f05e1b0598c39936634e3cb75ce4 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 21 Aug 2023 10:27:26 -0600 Subject: [PATCH 2/7] Integrate some Pydantic validation into ZenodoFetcher --- src/pudl/workspace/datastore.py | 123 ++++++++++++++------------ test/unit/workspace/datastore_test.py | 5 +- 2 files changed, 68 insertions(+), 60 deletions(-) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 78519aeffb..0c0895328a 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -9,11 +9,12 @@ from collections import defaultdict from collections.abc import Iterator from pathlib import Path -from typing import Any +from typing import Any, Self import datapackage import requests from google.auth.exceptions import DefaultCredentialsError +from pydantic import BaseModel, HttpUrl, confloat, constr from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry @@ -29,6 +30,7 @@ # long as we stick to read-only keys. 
PUDL_YML = Path.home() / ".pudl.yml" +ZenodoDOI = constr(regex=r"(10\.5072|10\.5281)/zenodo.([\d]+)") class ChecksumMismatch(ValueError): @@ -154,19 +156,13 @@ def get_json_string(self) -> str: return json.dumps(self.datapackage_json, sort_keys=True, indent=4) -class ZenodoFetcher: +class ZenodoFetcher(BaseModel): """API for fetching datapackage descriptors and resource contents from zenodo.""" - # Zenodo tokens recorded here should have read-only access to our archives. - # Including them here is correct in order to allow public use of this tool, so - # long as we stick to read-only keys. - TOKEN = { - # Read-only personal access tokens for pudl@catalyst.coop: - "sandbox": "qyPC29wGPaflUUVAv1oGw99ytwBqwEEdwi4NuUrpwc3xUcEwbmuB4emwysco", - "production": "KXcG5s9TqeuPh1Ukt5QYbzhCElp9LxuqAuiwdqHP0WS4qGIQiydHn6FBtdJ5", - } - - DOI = { + _descriptor_cache: dict[str, DatapackageDescriptor] = {} + http: requests.Session = requests.Session() + timeout: confloat(gt=0.0, allow_inf_nan=False) = 15.0 + zenodo_dois: dict[str, ZenodoDOI] = { # Sandbox DOIs are provided for reference "censusdp1tract": "10.5281/zenodo.4127049", # "censusdp1tract": "10.5072/zenodo.674992", @@ -195,47 +191,45 @@ class ZenodoFetcher: "ferc714": "10.5281/zenodo.7139875", # "ferc714": "10.5072/zenodo.1098302", } - API_ROOT = { - "sandbox": "https://sandbox.zenodo.org/api", - "production": "https://zenodo.org/api", - } - def __init__(self, timeout: float = 15.0): - """Constructs ZenodoFetcher instance. + class Config: + """Allow arbitrary types -- required for requests.Session.""" - Args: - timeout (float): timeout (in seconds) for http requests. - """ - self._dataset_to_doi = self.DOI - self._descriptor_cache: dict[str, DatapackageDescriptor] = {} + arbitrary_types_allowed = True + + def __init__(self: Self, **data): + """Constructs ZenodoFetcher instance.""" + super().__init__(**data) - self.timeout = timeout retries = Retry( backoff_factor=2, total=3, status_forcelist=[429, 500, 502, 503, 504] ) adapter = HTTPAdapter(max_retries=retries) - self.http = requests.Session() self.http.mount("http://", adapter) self.http.mount("https://", adapter) + for dataset in self.zenodo_dois: + try: + ZenodoDOI.validate(self.zenodo_dois[dataset]) + except Exception: + raise ValueError( + f"Invalid Zenodo DOI for {dataset}: {self.zenodo_dois[dataset]}" + ) - def _fetch_from_url(self, url: str) -> requests.Response: - logger.info(f"Retrieving {url} from zenodo") + def _get_token(self: Self, url: HttpUrl) -> str: + """Return the appropriate read-only Zenodo personal access token. + + These tokens are associated with the pudl@catalyst.coop Zenodo account, which + owns all of the Catalyst raw data archives. 
+        """
         if "sandbox" in url:
-            token = self.TOKEN["sandbox"]
-        else:
-            token = self.TOKEN["production"]
-        response = self.http.get(
-            url, params={"access_token": token}, timeout=self.timeout
-        )
-        if response.status_code == requests.codes.ok:
-            logger.debug(f"Successfully downloaded {url}")
-            return response
+            token = "qyPC29wGPaflUUVAv1oGw99ytwBqwEEdwi4NuUrpwc3xUcEwbmuB4emwysco"  # nosec: B105
         else:
-            raise ValueError(f"Could not download {url}: {response.text}")
+            token = "KXcG5s9TqeuPh1Ukt5QYbzhCElp9LxuqAuiwdqHP0WS4qGIQiydHn6FBtdJ5"  # nosec: B105
+        return token
 
-    def _doi_to_url(self, doi: str) -> str:
-        """Returns url that holds the datapackage for given doi."""
+    def _get_url(self: Self, doi: ZenodoDOI) -> HttpUrl:
+        """Construct a Zenodo deposition URL based on its Zenodo DOI."""
         match = re.search(r"(10\.5072|10\.5281)/zenodo.([\d]+)", doi)
 
         if match is None:
@@ -244,20 +238,29 @@ def _doi_to_url(self, doi: str) -> str:
         doi_prefix = match.groups()[0]
         zenodo_id = match.groups()[1]
         if doi_prefix == "10.5072":
-            api_root = self.API_ROOT["sandbox"]
+            api_root = "https://sandbox.zenodo.org/api"
         elif doi_prefix == "10.5281":
-            api_root = self.API_ROOT["production"]
+            api_root = "https://zenodo.org/api"
         else:
             raise ValueError(f"Invalid Zenodo DOI: {doi}")
         return f"{api_root}/deposit/depositions/{zenodo_id}"
 
-    def get_descriptor(self, dataset: str) -> DatapackageDescriptor:
-        """Returns class:`DatapackageDescriptor` for given dataset."""
-        doi = self._dataset_to_doi.get(dataset, False)
-        if not doi:
-            raise KeyError(f"No doi found for dataset {dataset}")
+    def _fetch_from_url(self: Self, url: HttpUrl) -> requests.Response:
+        logger.info(f"Retrieving {url} from zenodo")
+        response = self.http.get(
+            url, params={"access_token": self._get_token(url)}, timeout=self.timeout
+        )
+        if response.status_code == requests.codes.ok:
+            logger.debug(f"Successfully downloaded {url}")
+            return response
+        else:
+            raise ValueError(f"Could not download {url}: {response.text}")
+
+    def get_descriptor(self: Self, dataset: str) -> DatapackageDescriptor:
+        """Returns :class:`DatapackageDescriptor` for given dataset."""
+        doi = self.get_doi(dataset)
         if doi not in self._descriptor_cache:
-            dpkg = self._fetch_from_url(self._doi_to_url(doi))
+            dpkg = self._fetch_from_url(self._get_url(doi))
             for f in dpkg.json()["files"]:
                 if f["filename"] == "datapackage.json":
                     resp = self._fetch_from_url(f["links"]["download"])
@@ -271,15 +274,19 @@ def get_descriptor(self, dataset: str) -> DatapackageDescriptor:
         )
         return self._descriptor_cache[doi]
 
-    def get_resource_key(self, dataset: str, name: str) -> PudlResourceKey:
-        """Returns PudlResourceKey for given resource."""
-        return PudlResourceKey(dataset, self._dataset_to_doi[dataset], name)
+    def get_resource_key(self: Self, dataset: str, name: str) -> PudlResourceKey:
+        """Returns :class:`PudlResourceKey` for given resource."""
+        return PudlResourceKey(dataset, self.get_doi(dataset), name)
 
-    def get_doi(self, dataset: str) -> str:
+    def get_doi(self: Self, dataset: str) -> ZenodoDOI:
         """Returns DOI for given dataset."""
-        return self._dataset_to_doi[dataset]
+        try:
+            doi = self.zenodo_dois[dataset]
+        except KeyError:
+            raise KeyError(f"No Zenodo DOI found for dataset {dataset}.")
+        return doi
 
-    def get_resource(self, res: PudlResourceKey) -> bytes:
+    def get_resource(self: Self, res: PudlResourceKey) -> bytes:
         """Given resource key, retrieve contents of the file from zenodo."""
         desc = self.get_descriptor(res.dataset)
         url = desc.get_resource_path(res.name)
         content = self._fetch_from_url(url).content
desc.validate_checksum(res.name, content) return content - def get_known_datasets(self) -> list[str]: + def get_known_datasets(self: Self) -> list[str]: """Returns list of supported datasets.""" - return sorted(self._dataset_to_doi) + return sorted(self.zenodo_dois) class Datastore: @@ -442,11 +449,13 @@ def __call__(self, parser, namespace, values, option_string=None): def parse_command_line(): """Collect the command line arguments.""" - dois = "\n".join([f" - {x}" for x in ZenodoFetcher.DOI]) + known_datasets = "\n".join( + [f" - {x}" for x in ZenodoFetcher().get_known_datasets()] + ) dataset_msg = f""" -Available Production Datasets: -{dois}""" +Available Datasets: +{known_datasets}""" parser = argparse.ArgumentParser( description="Download and cache ETL source data from Zenodo.", diff --git a/test/unit/workspace/datastore_test.py b/test/unit/workspace/datastore_test.py index 323f21003e..7c530584df 100644 --- a/test/unit/workspace/datastore_test.py +++ b/test/unit/workspace/datastore_test.py @@ -189,8 +189,7 @@ def __init__( self, descriptors: dict[str, datastore.DatapackageDescriptor], **kwargs ): """Construct a test-friendly ZenodoFetcher with descriptors pre-loaded.""" - super().__init__(**kwargs) - self._descriptor_cache = dict(descriptors) + super().__init__(**kwargs, _descriptor_cache=descriptors) class TestZenodoFetcher(unittest.TestCase): @@ -258,7 +257,7 @@ def test_doi_format_is_correct(self): def test_get_known_datasets(self): """Call to get_known_datasets() produces the expected results.""" self.assertEqual( - sorted(datastore.ZenodoFetcher.DOI), + sorted(datastore.ZenodoFetcher().zenodo_dois), self.fetcher.get_known_datasets(), ) From c4b13fc86aed7a36d5c4796b472d559a22b1bc76 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 21 Aug 2023 13:51:19 -0600 Subject: [PATCH 3/7] Update allowed tox versions. --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 79a5ca3b18..caf97cac01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,7 @@ dev = [ "isort>=5.0,<5.13", "jedi>=0.18,<0.20", "lxml>=4.6,<4.10", - "tox>=4,<4.7", + "tox>=4,<4.11", "twine>=3.3,<4.1", ] doc = [ @@ -153,7 +153,7 @@ test = [ "pytest>=6.2,<7.5", "responses>=0.14,<0.24", "rstcheck[sphinx]>=5.0,<6.2", - "tox>=4.0,<4.10", + "tox>=4.0,<4.11", ] datasette = [ "datasette>=0.60,<0.65", From d840c1dd922eec5272445b1fc5a3c8e4c65f7a16 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 25 Aug 2023 20:25:12 -0600 Subject: [PATCH 4/7] Create a ZenodoDoiSettings Pydantic BaseSettings class. --- src/pudl/workspace/datastore.py | 136 +++++++++++++------------- test/unit/workspace/datastore_test.py | 7 +- 2 files changed, 72 insertions(+), 71 deletions(-) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 062cc471f5..1ff7c7ce5b 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -14,7 +14,7 @@ import datapackage import requests from google.auth.exceptions import DefaultCredentialsError -from pydantic import BaseModel, HttpUrl, confloat, constr +from pydantic import BaseSettings, HttpUrl, constr from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry @@ -25,12 +25,8 @@ logger = pudl.logging_helpers.get_logger(__name__) -# The Zenodo tokens recorded here should have read-only access to our archives. -# Including them here is correct in order to allow public use of this tool, so -# long as we stick to read-only keys. 
- PUDL_YML = Path.home() / ".pudl.yml" -ZenodoDOI = constr(regex=r"(10\.5072|10\.5281)/zenodo.([\d]+)") +ZenodoDoi = constr(regex=r"(10\.5072|10\.5281)/zenodo.([\d]+)") class ChecksumMismatch(ValueError): @@ -98,12 +94,12 @@ def _matches(self, res: dict, **filters: Any): ) def get_resources( - self, name: str = None, **filters: Any + self: Self, name: str = None, **filters: Any ) -> Iterator[PudlResourceKey]: """Returns series of PudlResourceKey identifiers for matching resources. Args: - name (str): if specified, find resource(s) with this name. + name: if specified, find resource(s) with this name. filters (dict): if specified, find resoure(s) matching these key=value constraints. The constraints are matched against the 'parts' field of the resource entry in the datapackage.json. @@ -156,65 +152,81 @@ def get_json_string(self) -> str: return json.dumps(self.datapackage_json, sort_keys=True, indent=4) -class ZenodoFetcher(BaseModel): - """API for fetching datapackage descriptors and resource contents from zenodo.""" - - _descriptor_cache: dict[str, DatapackageDescriptor] = {} - http: requests.Session = requests.Session() - timeout: confloat(gt=0.0, allow_inf_nan=False) = 15.0 - zenodo_dois: dict[str, ZenodoDOI] = { - # Sandbox DOIs are provided for reference - "censusdp1tract": "10.5281/zenodo.4127049", - # "censusdp1tract": "10.5072/zenodo.674992", - "eia860": "10.5281/zenodo.8164776", - # "eia860": "10.5072/zenodo.1222854", - "eia860m": "10.5281/zenodo.8188017", - # "eia860m": "10.5072/zenodo.1225517", - "eia861": "10.5281/zenodo.8231268", - # "eia861": "10.5072/zenodo.1229930", - "eia923": "10.5281/zenodo.8172818", - # "eia923": "10.5072/zenodo.1217724", - "eia_bulk_elec": "10.5281/zenodo.7067367", - # "eia_bulk_elec": "10.5072/zenodo.1103572", - "epacamd_eia": "10.5281/zenodo.7900974", - # "epacamd_eia": "10.5072/zenodo.1199170", - "epacems": "10.5281/zenodo.8235497", - # "epacems": "10.5072/zenodo.1228519", - "ferc1": "10.5281/zenodo.7314437", - # "ferc1": "10.5072/zenodo.1070868", - "ferc2": "10.5281/zenodo.8006881", - # "ferc2": "10.5072/zenodo.1188447", - "ferc6": "10.5281/zenodo.7130141", - # "ferc6": "10.5072/zenodo.1098088", - "ferc60": "10.5281/zenodo.7130146", - # "ferc60": "10.5072/zenodo.1098089", - "ferc714": "10.5281/zenodo.7139875", - # "ferc714": "10.5072/zenodo.1098302", - } +class ZenodoDoiSettings(BaseSettings): + """Digital Object Identifiers pointing to currently used Zenodo archives.""" + + # Sandbox DOIs are provided for reference + censusdp1tract: ZenodoDoi = "10.5281/zenodo.4127049" + # censusdp1tract: ZenodoDoi = "10.5072/zenodo.674992" + eia860: ZenodoDoi = "10.5281/zenodo.8164776" + # eia860: ZenodoDoi = "10.5072/zenodo.1222854" + eia860m: ZenodoDoi = "10.5281/zenodo.8188017" + # eia860m: ZenodoDoi = "10.5072/zenodo.1225517" + eia861: ZenodoDoi = "10.5281/zenodo.8231268" + # eia861: ZenodoDoi = "10.5072/zenodo.1229930" + eia923: ZenodoDoi = "10.5281/zenodo.8172818" + # eia923: ZenodoDoi = "10.5072/zenodo.1217724" + eia_bulk_elec: ZenodoDoi = "10.5281/zenodo.7067367" + # eia_bulk_elec: ZenodoDoi = "10.5072/zenodo.1103572" + epacamd_eia: ZenodoDoi = "10.5281/zenodo.7900974" + # epacamd_eia: ZenodoDoi = "10.5072/zenodo.1199170" + epacems: ZenodoDoi = "10.5281/zenodo.8235497" + # epacems": ZenodoDoi = "10.5072/zenodo.1228519" + ferc1: ZenodoDoi = "10.5281/zenodo.7314437" + # ferc1: ZenodoDoi = 10.5072/zenodo.1070868" + ferc2: ZenodoDoi = "10.5281/zenodo.8006881" + # ferc2: ZenodoDoi = "10.5072/zenodo.1188447" + ferc6: ZenodoDoi = "10.5281/zenodo.7130141" + # 
ferc6: ZenodoDoi = "10.5072/zenodo.1098088"
+    ferc60: ZenodoDoi = "10.5281/zenodo.7130146"
+    # ferc60: ZenodoDoi = "10.5072/zenodo.1098089"
+    ferc714: ZenodoDoi = "10.5281/zenodo.7139875"
+    # ferc714: ZenodoDoi = "10.5072/zenodo.1098302"
 
     class Config:
-        """Allow arbitrary types -- required for requests.Session."""
+        """Pydantic config, reads from .env file."""
+
+        env_prefix = "pudl_zenodo_doi_"
+        env_file = ".env"
+
+
+class ZenodoFetcher:
+    """API for fetching datapackage descriptors and resource contents from zenodo."""
 
-        arbitrary_types_allowed = True
+    _descriptor_cache: dict[str, DatapackageDescriptor]
+    zenodo_dois: ZenodoDoiSettings
+    timeout: float
+    http: requests.Session
 
-    def __init__(self: Self, **data):
+    def __init__(
+        self: Self, zenodo_dois: ZenodoDoiSettings | None = None, timeout: float = 15.0
+    ):
         """Constructs ZenodoFetcher instance."""
-        super().__init__(**data)
+        if not zenodo_dois:
+            zenodo_dois = ZenodoDoiSettings()
+        self.zenodo_dois = zenodo_dois
+
+        self.timeout = timeout
 
         retries = Retry(
             backoff_factor=2, total=3, status_forcelist=[429, 500, 502, 503, 504]
         )
         adapter = HTTPAdapter(max_retries=retries)
-
+        self.http = requests.Session()
         self.http.mount("http://", adapter)
         self.http.mount("https://", adapter)
-        for dataset in self.zenodo_dois:
-            try:
-                ZenodoDOI.validate(self.zenodo_dois[dataset])
-            except Exception:
-                raise ValueError(
-                    f"Invalid Zenodo DOI for {dataset}: {self.zenodo_dois[dataset]}"
-                )
+        self._descriptor_cache = {}
+
+    def get_doi(self: Self, dataset: str) -> ZenodoDoi:
+        """Returns DOI for given dataset."""
+        try:
+            doi = self.zenodo_dois.__getattribute__(dataset)
+        except AttributeError:
+            raise AttributeError(f"No Zenodo DOI found for dataset {dataset}.")
+        return doi
+
+    def get_known_datasets(self: Self) -> list[str]:
+        """Returns list of supported datasets."""
+        return [name for name, doi in sorted(self.zenodo_dois)]
 
     def _get_token(self: Self, url: HttpUrl) -> str:
         """Return the appropriate read-only Zenodo personal access token.
 
         These tokens are associated with the pudl@catalyst.coop Zenodo account, which
         owns all of the Catalyst raw data archives.
@@ -228,7 +240,7 @@ def _get_token(self: Self, url: HttpUrl) -> str:
         token = "KXcG5s9TqeuPh1Ukt5QYbzhCElp9LxuqAuiwdqHP0WS4qGIQiydHn6FBtdJ5"  # nosec: B105
         return token
 
-    def _get_url(self: Self, doi: ZenodoDOI) -> HttpUrl:
+    def _get_url(self: Self, doi: ZenodoDoi) -> HttpUrl:
         """Construct a Zenodo deposition URL based on its Zenodo DOI."""
         match = re.search(r"(10\.5072|10\.5281)/zenodo.([\d]+)", doi)
 
@@ -278,14 +290,6 @@ def get_descriptor(self: Self, dataset: str) -> DatapackageDescriptor:
         """Returns :class:`PudlResourceKey` for given resource."""
         return PudlResourceKey(dataset, self.get_doi(dataset), name)
 
-    def get_doi(self: Self, dataset: str) -> ZenodoDOI:
-        """Returns DOI for given dataset."""
-        try:
-            doi = self.zenodo_dois[dataset]
-        except KeyError:
-            raise KeyError(f"No Zenodo DOI found for dataset {dataset}.")
-        return doi
-
     def get_resource(self: Self, res: PudlResourceKey) -> bytes:
         """Given resource key, retrieve contents of the file from zenodo."""
         desc = self.get_descriptor(res.dataset)
@@ -294,10 +298,6 @@ def get_resource(self: Self, res: PudlResourceKey) -> bytes:
         desc.validate_checksum(res.name, content)
         return content
 
-    def get_known_datasets(self: Self) -> list[str]:
-        """Returns list of supported datasets."""
-        return sorted(self.zenodo_dois)
-
 
 class Datastore:
     """Handle connections and downloading of Zenodo Source archives."""
diff --git a/test/unit/workspace/datastore_test.py b/test/unit/workspace/datastore_test.py
index 8cf22be99e..f3982e5459 100644
--- a/test/unit/workspace/datastore_test.py
+++ b/test/unit/workspace/datastore_test.py
@@ -189,7 +189,8 @@ def __init__(
         self, descriptors: dict[str, datastore.DatapackageDescriptor], **kwargs
     ):
         """Construct a test-friendly ZenodoFetcher with descriptors pre-loaded."""
-        super().__init__(**kwargs, _descriptor_cache=descriptors)
+        super().__init__(**kwargs)
+        self._descriptor_cache = descriptors
 
 
 class TestZenodoFetcher(unittest.TestCase):
@@ -257,7 +258,7 @@ def test_doi_format_is_correct(self):
     def test_get_known_datasets(self):
         """Call to get_known_datasets() produces the expected results."""
         self.assertEqual(
-            sorted(datastore.ZenodoFetcher().zenodo_dois),
+            sorted(name for name, doi in datastore.ZenodoFetcher().zenodo_dois),
             self.fetcher.get_known_datasets(),
         )
 
@@ -296,7 +297,7 @@ def test_get_resource_key(self):
     def test_get_resource_key_for_unknown_dataset_fails(self):
         """When get_resource_key() is called for unknown dataset it throws KeyError."""
         self.assertRaises(
-            KeyError, self.fetcher.get_resource_key, "unknown", "blob.zip"
+            AttributeError, self.fetcher.get_resource_key, "unknown", "blob.zip"
         )
 
     @responses.activate

From 2d081f0dc70b201a6ca31108b758df4104037fbb Mon Sep 17 00:00:00 2001
From: Zane Selvans
Date: Fri, 25 Aug 2023 22:30:47 -0600
Subject: [PATCH 5/7] Update Zenodo DOI test to work better with new
 ZenodoDoiSettings

---
 test/unit/workspace/datastore_test.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/test/unit/workspace/datastore_test.py b/test/unit/workspace/datastore_test.py
index f3982e5459..d13d4754fd 100644
--- a/test/unit/workspace/datastore_test.py
+++ b/test/unit/workspace/datastore_test.py
@@ -242,10 +242,13 @@ def test_doi_format_is_correct(self):
         be checked in, thus this test will fail if a sandbox DOI with prefix 10.5072 is
         identified.
""" - ds = datastore.ZenodoFetcher() - self.assertTrue(ds.get_known_datasets()) - for dataset in ds.get_known_datasets(): - doi = ds.get_doi(dataset) + zf = datastore.ZenodoFetcher() + self.assertTrue(zf.get_known_datasets()) + for dataset, doi in zf.zenodo_dois: + self.assertTrue( + zf.get_doi(dataset) == doi, + msg=f"Zenodo DOI for {dataset} matches result of get_doi()", + ) self.assertFalse( re.fullmatch(r"10\.5072/zenodo\.[0-9]{5,10}", doi), msg=f"Zenodo sandbox DOI found for {dataset}: {doi}", From b5889ced26fe66c7e95947d9e4c68030bb52cb2f Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 29 Aug 2023 12:58:40 -0400 Subject: [PATCH 6/7] Update docs/dev/datastore.rst Co-authored-by: Dazhong Xia --- docs/dev/datastore.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev/datastore.rst b/docs/dev/datastore.rst index eb3f2f452e..31e1e6f5b0 100644 --- a/docs/dev/datastore.rst +++ b/docs/dev/datastore.rst @@ -38,7 +38,7 @@ For more detailed usage information, see: $ pudl_datastore --help The downloaded data will be used by the script to populate a datastore under -the your ``$PUDL_INPUT`` directory, organized by data source, form, and DOI:: +your ``$PUDL_INPUT`` directory, organized by data source, form, and DOI:: data/censusdp1tract/ data/eia860/ From 6ac9d9fd10c40dbe41ac10aa2e5295fd131a639e Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Tue, 29 Aug 2023 11:52:52 -0600 Subject: [PATCH 7/7] Remove deprecated pudl_datastore --pudl_in option and unused get_resource_key() method --- src/pudl/workspace/datastore.py | 11 ----------- test/unit/workspace/datastore_test.py | 17 ++++------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 1ff7c7ce5b..230cdb505c 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -286,10 +286,6 @@ def get_descriptor(self: Self, dataset: str) -> DatapackageDescriptor: ) return self._descriptor_cache[doi] - def get_resource_key(self: Self, dataset: str, name: str) -> PudlResourceKey: - """Returns :class:`PudlResourceKey` for given resource.""" - return PudlResourceKey(dataset, self.get_doi(dataset), name) - def get_resource(self: Self, res: PudlResourceKey) -> bytes: """Given resource key, retrieve contents of the file from zenodo.""" desc = self.get_descriptor(res.dataset) @@ -473,10 +469,6 @@ def parse_command_line(): "The default is to download all datasets, which may take hours depending on " "network speed.", ) - parser.add_argument( - "--pudl_in", - help="Input directory to use, overridng the $PUDL_INPUT environment variable.", - ) parser.add_argument( "--validate", help="Validate locally cached datapackages, but don't download anything.", @@ -594,9 +586,6 @@ def main(): logfile=args.logfile, loglevel=args.loglevel ) - if args.pudl_in: - PudlPaths.set_path_overrides(input_dir=args.pudl_in) - cache_path = None if not args.bypass_local_cache: cache_path = PudlPaths().input_dir diff --git a/test/unit/workspace/datastore_test.py b/test/unit/workspace/datastore_test.py index d13d4754fd..df389b6fbf 100644 --- a/test/unit/workspace/datastore_test.py +++ b/test/unit/workspace/datastore_test.py @@ -265,6 +265,10 @@ def test_get_known_datasets(self): self.fetcher.get_known_datasets(), ) + def test_get_unknown_dataset(self): + """Ensure that we get a failure when attempting to access an invalid dataset.""" + self.assertRaises(AttributeError, self.fetcher.get_doi, "unknown") + def test_doi_of_prod_epacems_matches(self): 
"""Most of the tests assume specific DOI for production epacems dataset. @@ -290,19 +294,6 @@ def test_get_descriptor_http_calls(self): self.assertEqual(self.MOCK_EPACEMS_DATAPACKAGE, desc.datapackage_json) # self.assertTrue(responses.assert_call_count("http://localhost/my/datapackage.json", 1)) - def test_get_resource_key(self): - """Tests normal operation of get_resource_key().""" - self.assertEqual( - PudlResourceKey("epacems", self.PROD_EPACEMS_DOI, "blob.zip"), - self.fetcher.get_resource_key("epacems", "blob.zip"), - ) - - def test_get_resource_key_for_unknown_dataset_fails(self): - """When get_resource_key() is called for unknown dataset it throws KeyError.""" - self.assertRaises( - AttributeError, self.fetcher.get_resource_key, "unknown", "blob.zip" - ) - @responses.activate def test_get_resource(self): """Test that get_resource() calls expected http request and returns content."""