This repository has been archived by the owner on Jan 12, 2024. It is now read-only.

Commit

Merge pull request #23 from catalyst-cooperative/dev
Switch to using requester pays on storage buckets.

Merging this even though I need to integrate these comments into the documentation, to deal with the bot automerge stuff. Should have put it on a feature branch but oh well. 😬
zaneselvans authored May 24, 2022
2 parents 84eaaa8 + 77f9db5 commit 43a0867
Showing 9 changed files with 191 additions and 74 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/pre-commit-ci-auto-merge.yaml
@@ -0,0 +1,20 @@
name: pre-commit-ci-auto-merge

on:
  workflow_run:
    types:
      - completed
    workflows:
      - 'tox-pytest'

jobs:
  bot-auto-merge:
    name: Auto-merge passing pre-commit-ci PRs
    runs-on: ubuntu-latest
    steps:
      - name: Auto-merge passing pre-commit-ci PRs
        if: ${{ github.event.workflow_run.conclusion == 'success' }}
        uses: ridedott/merge-me-action@v2
        with:
          GITHUB_LOGIN: pre-commit-ci
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
30 changes: 26 additions & 4 deletions .github/workflows/tox-pytest.yml
@@ -3,7 +3,7 @@ name: tox-pytest
on: [push, pull_request]

jobs:
build:
ci-test:
runs-on: ubuntu-latest
strategy:
matrix:
@@ -15,6 +15,13 @@ jobs:
with:
fetch-depth: 2

- id: 'auth'
name: 'Authenticate to Google Cloud'
uses: 'google-github-actions/auth@v0'
with:
credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'
export_environment_variables: true

- name: Set up conda environment for testing
uses: conda-incubator/[email protected]
with:
@@ -42,6 +49,11 @@ jobs:
- name: Upload test coverage report to CodeCov
uses: codecov/codecov-action@v3

ci-notify:
runs-on: ubuntu-latest
if: ${{ always() }}
needs: ci-test
steps:
- name: Inform the Codemonkeys
uses: 8398a7/action-slack@v3
with:
@@ -52,12 +64,22 @@ jobs:
username: 'action-slack',
icon_emoji: ':octocat:',
attachments: [{
color: '${{ job.status }}' === 'success' ? 'good' : '${{ job.status }}' === 'failure' ? 'danger' : 'warning',
text: `${process.env.AS_WORKFLOW}\n${process.env.AS_JOB} (${process.env.AS_COMMIT}) of ${process.env.AS_REPO}@${process.env.AS_REF} by ${process.env.AS_AUTHOR} ${{ job.status }} in ${process.env.AS_TOOK}`,
color: '${{ needs.ci-test.result }}' === 'success' ? 'good' : '${{ needs.ci-test.result }}' === 'failure' ? 'danger' : 'warning',
text: `${process.env.AS_REPO}@${process.env.AS_REF}\n ${process.env.AS_WORKFLOW} (${process.env.AS_COMMIT})\n by ${process.env.AS_AUTHOR}\n Status: ${{ needs.ci-test.result }}`,
}]
}
env:
GITHUB_TOKEN: ${{ github.token }} # required
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required
MATRIX_CONTEXT: ${{ toJson(matrix) }} # required
if: ${{ always() && github.actor != 'dependabot[bot]' }}

dependabot-auto-merge:
runs-on: ubuntu-latest
needs: ci-test
if: ${{ needs.ci-test.result == 'success' }}
steps:
- name: Automerge passing dependabot PR
uses: ridedott/merge-me-action@v2
with:
GITHUB_LOGIN: dependabot # For clarity only. dependabot is default.
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -55,13 +55,13 @@ repos:

# Check for errors in restructuredtext (.rst) files under the doc hierarchy
- repo: https://github.com/PyCQA/doc8
rev: 0.11.1
rev: 0.11.2
hooks:
- id: doc8
args: [--config, tox.ini]

- repo: https://github.com/myint/rstcheck
rev: v5.0.0
rev: v6.0.0a2
hooks:
- id: rstcheck
additional_dependencies: [sphinx]
131 changes: 96 additions & 35 deletions README.rst
@@ -50,55 +50,89 @@ Currently available datasets
Future datasets
~~~~~~~~~~~~~~~

* Raw FERC Form 1 DB (SQL) – `browse DB online <https://data.catalyst.coop/ferc1>`__
* PUDL DB (SQL) – `browse DB online <https://data.catalyst.coop/pudl>`__
* Census Demographic Profile 1 (SQL)
* Raw FERC Form 1 DB (SQLite) -- `browse DB online <https://data.catalyst.coop/ferc1>`__
* PUDL DB (SQLite) -- `browse DB online <https://data.catalyst.coop/pudl>`__
* Census Demographic Profile 1 (SQLite)

Ongoing Development
-------------------

Development is currently being organized under these epics in the main
PUDL repo:

* `Intake SQLite Driver <https://github.com/catalyst-cooperative/pudl/issues/1156>`__
* `EPA CEMS Intake Catalog <https://github.com/catalyst-cooperative/pudl/issues/1564>`__
* `Prototype SQLite Intake Catalog <https://github.com/catalyst-cooperative/pudl/issues/1156>`__
* `PUDL Intake Catalog <https://github.com/catalyst-cooperative/pudl/issues/1179>`__

See the `issues in this repository
See the `issues in the pudl-catalog repository
<https://github.com/catalyst-cooperative/pudl-catalog/issues>`__ for more
detailed tasks.

Planned data distribution system
Usage
-----

Public data and "requester pays"
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We’re in the process of implementing automated nightly builds of all of our data
products for each development branch with new commits in the main PUDL
repository. This will allow us to do exhaustive integration testing and data
validation on a daily basis. If all of the tests and data validation pass, then
a new version of the data products (SQLite databases and Parquet files) will be
produced, and placed into cloud storage.
The data we're publishing in the PUDL Catalog is publicly accessible and distributed
under the permissive `CC-BY-4.0 <https://creativecommons.org/licenses/by/4.0>`__
license. Catalyst covers the cost of storing the data in Google Cloud Storage buckets.
However, fees are incurred whenever data leaves Google's cloud infrastructure.
Depending on where you're downloading from, it costs $0.10-0.20 (USD) per GB.

These outputs will be made available via a data catalog on a corresponding
branch in this ``pudl-catalog`` repository. In general, only the catalogs and data
resources corresponding to the ``HEAD`` of development and feature branches will
be available. Releases that are tagged on the ``main`` branch will be retained
long term.
To share large amounts of public data without being exposed to large, unexpected bills
from Google when someone maliciously or accidentally downloads a large volume of data
programmatically, we've configured the cloud storage to use `requester pays
<https://cloud.google.com/storage/docs/requester-pays>`__. This means the person
downloading the data is responsible for those (modest) costs instead. Downloading all of
the EPA CEMS, FERC 1, PUDL, and US Census data we're publishing costs around $0.75 from
North America, and the data is cached locally so that it isn't downloaded again until
a new version is released.
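As a rough sanity check on those numbers, the quoted per-GB egress rate and the ~$0.75
total imply a dataset on the order of a few GB. A minimal sketch of the arithmetic
(the 5 GB size is an assumed, illustrative figure, not an official one):

```python
# Back-of-the-envelope egress cost for a requester-pays download.
# The $0.10-0.20/GB rates come from the text above; the 5 GB total
# dataset size is an assumption chosen to illustrate the arithmetic.
def egress_cost_usd(size_gb: float, rate_per_gb: float) -> float:
    """Cost (USD) of moving size_gb out of Google Cloud at rate_per_gb."""
    return size_gb * rate_per_gb

low = egress_cost_usd(5.0, 0.10)
high = egress_cost_usd(5.0, 0.20)
print(f"Estimated one-time download cost: ${low:.2f}-${high:.2f}")
```

Because the catalog caches data locally, this is a one-time cost per released data
version rather than a recurring one.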

The idea is that for any released version of PUDL, you should also be able to
install a corresponding data catalog, and know that the software and the data
are compatible. You can also install just the data catalog with minimal
dependencies, and not need to worry about the PUDL software that produced it at
all, if you simply want to access the DBs or Parquet files directly.
To set up a GCP billing project and use it for authentication when accessing the
catalog:

In development, this arrangement will mean that every morning you should have
access to a fully processed set of data products that reflect the branch of code
that you’re working on, rather than the data and code getting progressively
further out of sync as you do development, until you take the time to re-run the
full ETL locally yourself.
* `Create a project on GCP <https://cloud.google.com/resource-manager/docs/creating-managing-projects#creating_a_project>`__;
  if this is your first time using GCP, a prompt should appear asking you to choose which
  Google account to use for your GCP-related activities. (You should also receive $300
  in initial cloud credits.)
* `Create a Cloud Billing account <https://cloud.google.com/billing/docs/how-to/manage-billing-account#create_a_new_billing_account>`__
associated with the project and `enable billing for the project
<https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project>`__
through this account.
* `Using Google Cloud IAM <https://cloud.google.com/iam/docs/granting-changing-revoking-access#granting-console>`__,
add the **Service Usage Consumer** role to your account, which enables it to make
billed requests on behalf of the project.
* Install the `gcloud utilities <https://cloud.google.com/sdk/docs/install>`__ on your
computer. This can be done using ``conda`` (or ``mamba``):

  .. code:: bash

     conda install -c conda-forge google-cloud-sdk

* Initialize the ``gcloud`` command line interface, logging into the account used to
create the aforementioned project and selecting it as the default project; this will
allow the project to be used for requester pays access through the command line:

  .. code:: bash

     gcloud auth login
     gcloud init

* Finally, use ``gcloud`` to establish application default credentials; this will allow
the project to be used for requester pays access through applications:

  .. code:: bash

     gcloud auth application-default login

Example Usage
-------------
* To test whether your GCP account is set up correctly and authenticated, you can run
  the following command to list the contents of the cloud storage bucket containing the
  intake catalog data:

See the notebook included in this repository for more details.
  .. code:: bash

     gsutil ls gs://intake.catalyst.coop

Import Intake Catalogs
~~~~~~~~~~~~~~~~~~~~~~
@@ -222,11 +256,7 @@ on that dataframe to actually read the data and return a pandas dataframe:
states=["ID", "CO", "TX"],
)
epacems_df = (
pudl_cat.hourly_emissions_epacems(
filters=filters
index=False,
split_row_groups=True,
)
pudl_cat.hourly_emissions_epacems(filters=filters)
.to_dask()
.compute()
)
@@ -253,6 +283,37 @@
469,4,2019-01-01 10:00:00+00:00,2019,CO,79,298,1.0,204.0,2129.2,126.2
469,4,2019-01-01 11:00:00+00:00,2019,CO,79,298,1.0,204.0,2160.6,128.1
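For reference, the ``filters`` argument used above follows the disjunctive-normal-form
convention of ``dask.dataframe.read_parquet`` / pyarrow: an outer OR list of inner
AND lists of ``(column, op, value)`` tuples. A hypothetical sketch of what a helper
like ``year_state_filter`` might produce (this illustrates the filter format, not
necessarily the exact implementation in ``pudl_catalog.helpers``):

```python
from itertools import product

def year_state_filter(years=None, states=None):
    """Build pyarrow-style row-group filters: OR over (year AND state) pairs.

    Each inner list is AND-ed together; the outer list is OR-ed, matching
    the `filters` argument of dask.dataframe.read_parquet / pyarrow.
    """
    years = years or []
    states = states or []
    if years and states:
        return [
            [("year", "=", y), ("state", "=", s)]
            for y, s in product(years, states)
        ]
    if years:
        return [[("year", "=", y)] for y in years]
    return [[("state", "=", s)] for s in states]

filters = year_state_filter(years=[2019], states=["ID", "CO", "TX"])
# e.g. [[('year', '=', 2019), ('state', '=', 'ID')],
#       [('year', '=', 2019), ('state', '=', 'CO')],
#       [('year', '=', 2019), ('state', '=', 'TX')]]
```

Only the row groups whose ``year``/``state`` statistics match one of these clauses are
downloaded, which is what keeps the requester-pays transfer costs small.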

See the Jupyter notebook included in this repository for more details.


Planned data distribution system
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

We’re in the process of implementing automated nightly builds of all of our data
products for each development branch with new commits in the main PUDL
repository. This will allow us to do exhaustive integration testing and data
validation on a daily basis. If all of the tests and data validation pass, then
a new version of the data products (SQLite databases and Parquet files) will be
produced, and placed into cloud storage.

These outputs will be made available via a data catalog on a corresponding
branch in this ``pudl-catalog`` repository. In general, only the catalogs and data
resources corresponding to the ``HEAD`` of development and feature branches will
be available. Releases that are tagged on the ``main`` branch will be retained
long term.

The idea is that for any released version of PUDL, you should also be able to
install a corresponding data catalog, and know that the software and the data
are compatible. You can also install just the data catalog with minimal
dependencies, and not need to worry about the PUDL software that produced it at
all, if you simply want to access the DBs or Parquet files directly.

In development, this arrangement will mean that every morning you should have
access to a fully processed set of data products that reflect the branch of code
that you’re working on, rather than the data and code getting progressively
further out of sync as you do development, until you take the time to re-run the
full ETL locally yourself.

Benefits of Intake Catalogs
---------------------------

29 changes: 15 additions & 14 deletions notebooks/pudl-catalog.ipynb
@@ -51,7 +51,8 @@
"from pudl_catalog.helpers import year_state_filter\n",
"\n",
"TEST_YEARS = [2019, 2020]\n",
"TEST_STATES = [\"ID\", \"CO\", \"TX\"]"
"TEST_STATES = [\"ID\", \"ME\"]\n",
"TEST_FILTERS = year_state_filter(years=TEST_YEARS, states=TEST_STATES)"
]
},
{
@@ -117,7 +118,7 @@
"source": [
"%%time\n",
"# This takes forever and downloads the whole dataset\n",
"pudl_cat.hourly_emissions_epacems.discover()"
"pudl_cat.hourly_emissions_epacems().discover()"
]
},
{
@@ -136,16 +137,11 @@
"%%time\n",
"print(f\"Reading data from {os.getenv('PUDL_INTAKE_PATH')}\")\n",
"print(f\"Caching data to {os.getenv('PUDL_INTAKE_CACHE')}\")\n",
"filters = year_state_filter(\n",
" years=TEST_YEARS,\n",
" states=TEST_STATES,\n",
")\n",
"display(filters)\n",
"display(TEST_FILTERS)\n",
"epacems_df = (\n",
" pudl_cat.hourly_emissions_epacems(\n",
" filters=filters,\n",
" )\n",
" .to_dask().compute()\n",
" pudl_cat.hourly_emissions_epacems(filters=TEST_FILTERS)\n",
" .to_dask()\n",
" .compute()\n",
")"
]
},
@@ -181,7 +177,10 @@
"outputs": [],
"source": [
"%%time\n",
"df1 = pd.read_parquet(f\"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet\")"
"df1 = pd.read_parquet(\n",
" f\"{os.getenv('PUDL_INTAKE_PATH')}/hourly_emissions_epacems/epacems-2020-ID.parquet\",\n",
" storage_options={\"requester_pays\": True},\n",
")"
]
},
{
@@ -191,7 +190,9 @@
"outputs": [],
"source": [
"%%time\n",
"df2 = pudl_cat.hourly_emissions_epacems(filters=year_state_filter(years=[2020], states=[\"ID\"])).to_dask().compute()"
"df2 = pudl_cat.hourly_emissions_epacems(\n",
" filters=year_state_filter(years=[2020], states=[\"ID\"])\n",
").to_dask().compute()"
]
},
{
@@ -221,7 +222,7 @@
"import fsspec\n",
"epacems_pq = pq.read_table(\n",
" f\"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet\",\n",
" filesystem=fsspec.filesystem(\"gs\"),\n",
" filesystem=fsspec.filesystem(\"gs\", requester_pays=True),\n",
")\n",
"dtype_dict = {name: dtype for name, dtype in zip(epacems_pq.schema.names, epacems_pq.schema.types)}\n",
"pprint(dtype_dict, indent=4, sort_dicts=False)"
10 changes: 5 additions & 5 deletions setup.py
@@ -45,10 +45,10 @@
zip_safe=False,
python_requires=">=3.8,<3.11",
install_requires=[
"gcsfs>=2022,<2023",
"intake>=0.6.5",
"intake_parquet>=0.2.3",
"intake_sql>=0.3.1",
"gcsfs>=2021.7,<2022.3.1",
"intake>=0.6.5,<0.7",
"intake_parquet>=0.2.3,<0.3",
"intake_sql>=0.3.1,<0.4",
"msgpack>=1,<2",
"pandas>=1.4,<1.5",
],
@@ -83,7 +83,7 @@
"pydocstyle>=5.1,<7", # Style guidelines for Python documentation
"pytest>=6.2,<8", # Our testing framework
"pytest-cov>=2.10,<4.0", # Pytest plugin for working with coverage
"rstcheck>=5,<6", # ReStructuredText linter
"rstcheck[sphinx]>=5,<6", # ReStructuredText linter
"tox>=3.20,<4", # Python test environment manager
],
},
12 changes: 10 additions & 2 deletions src/pudl_catalog/pudl_catalog.yaml
@@ -38,9 +38,13 @@ sources:
path: "https://creativecommons.org/licenses/by/4.0"
args: # These arguments are for dask.dataframe.read_parquet()
engine: 'pyarrow'
split_row_groups: true
index: false
urlpath: '{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems.parquet'
storage_options:
token: 'anon' # Explicitly use anonymous access.
requester_pays: true
gs:
requester_pays: true
simplecache:
cache_storage: '{{ env(PUDL_INTAKE_CACHE) }}'

@@ -62,9 +66,13 @@ sources:
path: "https://creativecommons.org/licenses/by/4.0"
args: # These arguments are for dask.dataframe.read_parquet()
engine: 'pyarrow'
split_row_groups: true
index: false
urlpath: '{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems/*.parquet'
storage_options:
token: 'anon' # Explicitly use anonymous access.
requester_pays: true
gs:
requester_pays: true
simplecache:
cache_storage: '{{ env(PUDL_INTAKE_CACHE) }}'
