diff --git a/.github/workflows/pre-commit-ci-auto-merge.yaml b/.github/workflows/pre-commit-ci-auto-merge.yaml
new file mode 100644
index 0000000..099c42f
--- /dev/null
+++ b/.github/workflows/pre-commit-ci-auto-merge.yaml
@@ -0,0 +1,20 @@
+name: pre-commit-ci-auto-merge
+
+on:
+  workflow_run:
+    types:
+      - completed
+    workflows:
+      - 'tox-pytest'
+
+jobs:
+  bot-auto-merge:
+    name: Auto-merge passing pre-commit-ci PRs
+    runs-on: ubuntu-latest
+    steps:
+      - name: Auto-merge passing pre-commit-ci PRs
+        if: ${{ github.event.workflow_run.conclusion == 'success' }}
+        uses: ridedott/merge-me-action@v2
+        with:
+          GITHUB_LOGIN: pre-commit-ci
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index b6070b7..4fb6e82 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -3,7 +3,7 @@ name: tox-pytest
 on: [push, pull_request]
 
 jobs:
-  build:
+  ci-test:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -15,6 +15,13 @@ jobs:
         with:
           fetch-depth: 2
 
+      - id: 'auth'
+        name: 'Authenticate to Google Cloud'
+        uses: 'google-github-actions/auth@v0'
+        with:
+          credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}'
+          export_environment_variables: true
+
       - name: Set up conda environment for testing
         uses: conda-incubator/setup-miniconda@v2.1.1
         with:
@@ -42,6 +49,11 @@ jobs:
       - name: Upload test coverage report to CodeCov
         uses: codecov/codecov-action@v3
 
+  ci-notify:
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    needs: ci-test
+    steps:
       - name: Inform the Codemonkeys
         uses: 8398a7/action-slack@v3
         with:
@@ -52,12 +64,22 @@ jobs:
               username: 'action-slack',
               icon_emoji: ':octocat:',
               attachments: [{
-                color: '${{ job.status }}' === 'success' ? 'good' : '${{ job.status }}' === 'failure' ? 'danger' : 'warning',
-                text: `${process.env.AS_WORKFLOW}\n${process.env.AS_JOB} (${process.env.AS_COMMIT}) of ${process.env.AS_REPO}@${process.env.AS_REF} by ${process.env.AS_AUTHOR} ${{ job.status }} in ${process.env.AS_TOOK}`,
+                color: '${{ needs.ci-test.result }}' === 'success' ? 'good' : '${{ needs.ci-test.result }}' === 'failure' ? 'danger' : 'warning',
+                text: `${process.env.AS_REPO}@${process.env.AS_REF}\n ${process.env.AS_WORKFLOW} (${process.env.AS_COMMIT})\n by ${process.env.AS_AUTHOR}\n Status: ${{ needs.ci-test.result }}`,
              }]
            }
        env:
          GITHUB_TOKEN: ${{ github.token }} # required
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required
          MATRIX_CONTEXT: ${{ toJson(matrix) }} # required
-        if: ${{ always() && github.actor != 'dependabot[bot]' }}
+
+  dependabot-auto-merge:
+    runs-on: ubuntu-latest
+    needs: ci-test
+    if: ${{ needs.ci-test.result == 'success' }}
+    steps:
+      - name: Automerge passing dependabot PR
+        uses: ridedott/merge-me-action@v2
+        with:
+          GITHUB_LOGIN: dependabot # For clarity only. dependabot is default.
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 15aca97..df9ada2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -55,13 +55,13 @@ repos:
 
   # Check for errors in restructuredtext (.rst) files under the doc hierarchy
   - repo: https://github.com/PyCQA/doc8
-    rev: 0.11.1
+    rev: 0.11.2
     hooks:
       - id: doc8
        args: [--config, tox.ini]
 
   - repo: https://github.com/myint/rstcheck
-    rev: v5.0.0
+    rev: v6.0.0a2
     hooks:
       - id: rstcheck
        additional_dependencies: [sphinx]
diff --git a/README.rst b/README.rst
index c6b1091..bfac29a 100644
--- a/README.rst
+++ b/README.rst
@@ -50,9 +50,9 @@ Currently available datasets
 Future datasets
 ~~~~~~~~~~~~~~~
 
-* Raw FERC Form 1 DB (SQL) – `browse DB online `__
-* PUDL DB (SQL) – `browse DB online `__
-* Census Demographic Profile 1 (SQL)
+* Raw FERC Form 1 DB (SQLite) -- `browse DB online `__
+* PUDL DB (SQLite) -- `browse DB online `__
+* Census Demographic Profile 1 (SQLite)
 
 Ongoing Development
 -------------------
@@ -60,45 +60,79 @@ Ongoing Development
 Development is currently being organized under these epics in the main PUDL
 repo:
 
+* `Intake SQLite Driver `__
 * `EPA CEMS Intake Catalog `__
-* `Prototype SQLite Intake Catalog `__
+* `PUDL Intake Catalog `__
 
-See the `issues in this repository
+See the `issues in the pudl-catalog repository
 `__ for more detailed tasks.
 
-Planned data distribution system
+Usage
+-----
+
+Public data and "requester pays"
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-We’re in the process of implementing automated nightly builds of all of our data
-products for each development branch with new commits in the main PUDL
-repository. This will allow us to do exhaustive integration testing and data
-validation on a daily basis. If all of the tests and data validation pass, then
-a new version of the data products (SQLite databases and Parquet files) will be
-produced, and placed into cloud storage.
+The data we're publishing in the PUDL Catalog is publicly accessible and distributed
+under the permissive `CC-BY-4.0 `__
+license. Catalyst covers the cost of storing the data in Google Cloud Storage buckets.
+However, there are also fees incurred when data leaves the Google Cloud infrastructure.
+Depending on where you're downloading from, it costs $0.10-0.20 (USD) per GB.
 
-These outputs will be made available via a data catalog on a corresponding
-branch in this ``pudl-catalog`` repository. Ingeneral only the catalogs and data
-resources corresponding to the ``HEAD`` of development and feature branches will
-be available. Releases that are tagged on the ``main`` branch will be retained
-long term.
+In order to share large amounts of public data without being exposed to large,
+unexpected bills from Google due to someone maliciously or accidentally downloading a
+large volume of data programmatically, we've set the cloud storage to use `requester
+pays `__. This means the person downloading the
+data is responsible for those (modest) costs instead. Downloading all of the EPA CEMS,
+FERC 1, PUDL, and US Census data we're publishing costs around $0.75 if you download
+from within North America, and the data is cached locally so that it isn't downloaded
+again until a new version is released.
 
-The idea is that for any released version of PUDL, you should also be able to
-install a corresponding data catalog, and know that the software and the data
-are compatible. You can also install just the data catalog with minimal
-dependencies, and not need to worry about the PUDL software that produced it at
-all, if you simply want to access the DBs or Parquet files directly.
+To set up a GCP billing project and use it for authentication when accessing the
+catalog:
 
-In development, this arrangement will mean that every morning you should have
-access to a fully processed set of data products that reflect the branch of code
-that you’re working on, rather than the data and code getting progressively
-further out of sync as you do development, until you take the time to re-run the
-full ETL locally yourself.
+* `Create a project on GCP `__;
+  if this is your first time using GCP, a prompt should appear asking you to choose which
+  Google account to use for your GCP-related activities. (You should also receive $300
+  in initial cloud credits.)
+* `Create a Cloud Billing account `__
+  associated with the project and `enable billing for the project
+  `__
+  through this account.
+* `Using Google Cloud IAM `__,
+  add the **Service Usage Consumer** role to your account, which enables it to make
+  billed requests on behalf of the project.
+* Install the `gcloud utilities `__ on your
+  computer. This can be done using ``conda`` (or ``mamba``):
+
+.. code:: bash
+
+   conda install -c conda-forge google-cloud-sdk
+
+* Initialize the ``gcloud`` command line interface, logging into the account used to
+  create the aforementioned project and selecting it as the default project; this will
+  allow the project to be used for requester pays access through the command line:
+
+.. code:: bash
+
+   gcloud auth login
+   gcloud init
+
+* Finally, use ``gcloud`` to establish application default credentials; this will allow
+  the project to be used for requester pays access through applications:
+
+.. code:: bash
+
+   gcloud auth application-default login
 
-Example Usage
--------------
+* To test whether your GCP account is set up correctly and authenticated, you can run
+  the following command to list the contents of the cloud storage bucket containing the
+  intake catalog data:
 
-See the notebook included in this repository for more details.
+.. code:: bash
+
+   gsutil ls gs://intake.catalyst.coop
 
 
 Import Intake Catalogs
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -222,11 +256,7 @@ on that dataframe to actually read the data and return a pandas dataframe:
         states=["ID", "CO", "TX"],
     )
     epacems_df = (
-        pudl_cat.hourly_emissions_epacems(
-            filters=filters
-            index=False,
-            split_row_groups=True,
-        )
+        pudl_cat.hourly_emissions_epacems(filters=filters)
         .to_dask()
         .compute()
     )
@@ -253,6 +283,37 @@ on that dataframe to actually read the data and return a pandas dataframe:
     469,4,2019-01-01 10:00:00+00:00,2019,CO,79,298,1.0,204.0,2129.2,126.2
     469,4,2019-01-01 11:00:00+00:00,2019,CO,79,298,1.0,204.0,2160.6,128.1
 
+See the Jupyter notebook included in this repository for more details.
+
+
+Planned data distribution system
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We’re in the process of implementing automated nightly builds of all of our data
+products for each development branch with new commits in the main PUDL
+repository. This will allow us to do exhaustive integration testing and data
+validation on a daily basis. If all of the tests and data validation pass, then
+a new version of the data products (SQLite databases and Parquet files) will be
+produced, and placed into cloud storage.
+
+These outputs will be made available via a data catalog on a corresponding
+branch in this ``pudl-catalog`` repository. In general only the catalogs and data
+resources corresponding to the ``HEAD`` of development and feature branches will
+be available. Releases that are tagged on the ``main`` branch will be retained
+long term.
+
+The idea is that for any released version of PUDL, you should also be able to
+install a corresponding data catalog, and know that the software and the data
+are compatible. You can also install just the data catalog with minimal
+dependencies, and not need to worry about the PUDL software that produced it at
+all, if you simply want to access the DBs or Parquet files directly.
+
+In development, this arrangement will mean that every morning you should have
+access to a fully processed set of data products that reflect the branch of code
+that you’re working on, rather than the data and code getting progressively
+further out of sync as you do development, until you take the time to re-run the
+full ETL locally yourself.
+
 
 Benefits of Intake Catalogs
 ---------------------------
diff --git a/notebooks/pudl-catalog.ipynb b/notebooks/pudl-catalog.ipynb
index 22968aa..a4ae35f 100644
--- a/notebooks/pudl-catalog.ipynb
+++ b/notebooks/pudl-catalog.ipynb
@@ -51,7 +51,8 @@
     "from pudl_catalog.helpers import year_state_filter\n",
     "\n",
     "TEST_YEARS = [2019, 2020]\n",
-    "TEST_STATES = [\"ID\", \"CO\", \"TX\"]"
+    "TEST_STATES = [\"ID\", \"ME\"]\n",
+    "TEST_FILTERS = year_state_filter(years=TEST_YEARS, states=TEST_STATES)"
    ]
   },
   {
@@ -117,7 +118,7 @@
    "source": [
     "%%time\n",
     "# This takes forever and downloads the whole dataset\n",
-    "pudl_cat.hourly_emissions_epacems.discover()"
+    "pudl_cat.hourly_emissions_epacems().discover()"
    ]
   },
   {
@@ -136,16 +137,11 @@
     "%%time\n",
     "print(f\"Reading data from {os.getenv('PUDL_INTAKE_PATH')}\")\n",
     "print(f\"Caching data to {os.getenv('PUDL_INTAKE_CACHE')}\")\n",
-    "filters = year_state_filter(\n",
-    "    years=TEST_YEARS,\n",
-    "    states=TEST_STATES,\n",
-    ")\n",
-    "display(filters)\n",
+    "display(TEST_FILTERS)\n",
     "epacems_df = (\n",
-    "    pudl_cat.hourly_emissions_epacems(\n",
-    "        filters=filters,\n",
-    "    )\n",
-    "    .to_dask().compute()\n",
+    "    pudl_cat.hourly_emissions_epacems(filters=TEST_FILTERS)\n",
+    "    .to_dask()\n",
+    "    .compute()\n",
     ")"
    ]
   },
@@ -181,7 +177,10 @@
    "outputs": [],
    "source": [
     "%%time\n",
-    "df1 = pd.read_parquet(f\"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet\")"
+    "df1 = pd.read_parquet(\n",
+    "    f\"{os.getenv('PUDL_INTAKE_PATH')}/hourly_emissions_epacems/epacems-2020-ID.parquet\",\n",
+    "    storage_options={\"requester_pays\": True},\n",
+    ")"
    ]
   },
   {
@@ -191,7 +190,9 @@
    "outputs": [],
    "source": [
     "%%time\n",
-    "df2 = pudl_cat.hourly_emissions_epacems(filters=year_state_filter(years=[2020], states=[\"ID\"])).to_dask().compute()"
+    "df2 = pudl_cat.hourly_emissions_epacems(\n",
+    "    filters=year_state_filter(years=[2020], states=[\"ID\"])\n",
+    ").to_dask().compute()"
    ]
   },
   {
@@ -221,7 +222,7 @@
     "import fsspec\n",
     "epacems_pq = pq.read_table(\n",
     "    f\"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet\",\n",
-    "    filesystem=fsspec.filesystem(\"gs\"),\n",
+    "    filesystem=fsspec.filesystem(\"gs\", requester_pays=True),\n",
     ")\n",
     "dtype_dict = {name: dtype for name, dtype in zip(epacems_pq.schema.names, epacems_pq.schema.types)}\n",
     "pprint(dtype_dict, indent=4, sort_dicts=False)"
diff --git a/setup.py b/setup.py
index d03fc2e..67c3bfb 100644
--- a/setup.py
+++ b/setup.py
@@ -45,10 +45,10 @@
     zip_safe=False,
     python_requires=">=3.8,<3.11",
     install_requires=[
-        "gcsfs>=2022,<2023",
"intake>=0.6.5", - "intake_parquet>=0.2.3", - "intake_sql>=0.3.1", + "gcsfs>=2021.7,<2022.3.1", + "intake>=0.6.5,<0.7", + "intake_parquet>=0.2.3,<0.3", + "intake_sql>=0.3.1,<0.4", "msgpack>=1,<2", "pandas>=1.4,<1.5", ], @@ -83,7 +83,7 @@ "pydocstyle>=5.1,<7", # Style guidelines for Python documentation "pytest>=6.2,<8", # Our testing framework "pytest-cov>=2.10,<4.0", # Pytest plugin for working with coverage - "rstcheck>=5,<6", # ReStructuredText linter + "rstcheck[sphinx]>=5,<6", # ReStructuredText linter "tox>=3.20,<4", # Python test environment manager ], }, diff --git a/src/pudl_catalog/pudl_catalog.yaml b/src/pudl_catalog/pudl_catalog.yaml index eacc074..f913d92 100644 --- a/src/pudl_catalog/pudl_catalog.yaml +++ b/src/pudl_catalog/pudl_catalog.yaml @@ -38,9 +38,13 @@ sources: path: "https://creativecommons.org/licenses/by/4.0" args: # These arguments are for dask.dataframe.read_parquet() engine: 'pyarrow' + split_row_groups: true + index: false urlpath: '{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems.parquet' storage_options: - token: 'anon' # Explicitly use anonymous access. + requester_pays: true + gs: + requester_pays: true simplecache: cache_storage: '{{ env(PUDL_INTAKE_CACHE) }}' @@ -62,9 +66,13 @@ sources: path: "https://creativecommons.org/licenses/by/4.0" args: # These arguments are for dask.dataframe.read_parquet() engine: 'pyarrow' + split_row_groups: true + index: false urlpath: '{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems/*.parquet' storage_options: - token: 'anon' # Explicitly use anonymous access. + requester_pays: true + gs: + requester_pays: true simplecache: cache_storage: '{{ env(PUDL_INTAKE_CACHE) }}' diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index d3a2812..095478d 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -48,7 +48,11 @@ def expected_df() -> pd.DataFrame: partition=False, table_name="hourly_emissions_epacems", ) - return pd.read_parquet(epacems_url, filters=TEST_FILTERS) + return pd.read_parquet( + epacems_url, + filters=TEST_FILTERS, + storage_options={"requester_pays": True}, + ) @pytest.mark.parametrize( @@ -71,7 +75,11 @@ def test_read_parquet( protocol=protocol, partition=partition, table_name="hourly_emissions_epacems" ) start_time = time.time() - df = pd.read_parquet(epacems_url, filters=TEST_FILTERS) + df = pd.read_parquet( + epacems_url, + filters=TEST_FILTERS, + storage_options={"requester_pays": True}, + ) elapsed_time = time.time() - start_time logger.info(f" elapsed time: {elapsed_time:.2f}s") assert_frame_equal(df, expected_df) @@ -103,8 +111,6 @@ def test_intake_catalog( pudl_cat[src]( filters=TEST_FILTERS, cache_method="", - index=False, - split_row_groups=True, ) .to_dask() .compute() diff --git a/tox.ini b/tox.ini index 03148bd..3a679a5 100644 --- a/tox.ini +++ b/tox.ini @@ -14,6 +14,8 @@ passenv = CONDA_PREFIX GITHUB_* HOME + GOOGLE_APPLICATION_CREDENTIALS + covargs = --cov={envsitepackagesdir}/pudl_catalog --cov-append --cov-report=xml covreport = coverage report --sort=cover @@ -32,7 +34,7 @@ commands = description = Check formatting and syntax of RST files. 
 skip_install = false
 extras =
-    test
+    tests
 commands =
     rstcheck --config tox.ini --recursive ./
 
@@ -216,10 +218,7 @@
 ignore-path = docs/_build
 
 [rstcheck]
-report = warning
-ignore_roles =
-    pr,
-    issue,
-    user,
-ignore directives =
-    bibliography,
+report_level = warning
+ignore_roles = pr,issue,user
+ignore_messages = (Hyperlink target ".*" is not referenced\.$)
+ignore_directives = bibliography,todo
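
As a quick end-to-end check of the requester-pays changes above, the catalog can be
exercised from Python roughly as in the README and notebook examples. This is a sketch,
not part of the patch: it assumes ``pudl_catalog`` is installed, that ``gcloud auth
application-default login`` has been run against a billing-enabled project, that the
``PUDL_INTAKE_PATH`` and ``PUDL_INTAKE_CACHE`` environment variables are set, and that
the catalog is exposed as ``intake.cat.pudl_cat`` (that entry point name is an
assumption here, not something this patch defines).

.. code:: python

   # Sketch: read one state-year of EPA CEMS data through the PUDL Intake
   # catalog with requester-pays billing. Assumes application default
   # credentials and a default GCP billing project are already configured.
   import intake

   from pudl_catalog.helpers import year_state_filter

   pudl_cat = intake.cat.pudl_cat  # assumed catalog entry point name

   # Same filter helper that the notebook and integration tests use.
   filters = year_state_filter(years=[2020], states=["ID"])

   epacems_df = (
       pudl_cat.hourly_emissions_epacems(filters=filters)
       .to_dask()
       .compute()
   )
   print(epacems_df.head())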
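The direct Parquet reads touched in the notebook and integration tests can be
spot-checked the same way; again a sketch under the same credential assumptions, not
part of the patch:

.. code:: python

   # Sketch: verify requester-pays access to the bucket with gcsfs and pandas,
   # mirroring the storage_options added in the notebook and tests.
   import os

   import gcsfs
   import pandas as pd

   # Listing the bucket bills the (tiny) request to your default GCP project.
   fs = gcsfs.GCSFileSystem(requester_pays=True)
   print(fs.ls("intake.catalyst.coop"))

   # Read a single partition, as in notebooks/pudl-catalog.ipynb.
   df = pd.read_parquet(
       f"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet",
       storage_options={"requester_pays": True},
   )
   print(df.shape)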