Merge branch 'dev' into eia860-2022-final-release

catalyst-cooperative · Nov 13, 2023 · 5251f84 · 5251f84
2 parents d84686a + fa096f2
commit 5251f84
Show file tree

Hide file tree

Showing 36 changed files with 479 additions and 163 deletions.
diff --git a/.github/workflows/build-deploy-pudl.yml b/.github/workflows/build-deploy-pudl.yml
@@ -117,6 +117,7 @@ jobs:
             --container-env DAGSTER_PG_HOST="104.154.182.24" \
             --container-env DAGSTER_PG_DB="dagster-storage" \
             --container-env PUDL_SETTINGS_YML="/home/catalyst/src/pudl/package_data/settings/etl_full.yml" \
+            --container-env FLY_ACCESS_TOKEN=${{ secrets.FLY_ACCESS_TOKEN }} \
 
       # Start the VM
       - name: Start the deploy-pudl-vm

diff --git a/.github/workflows/update-lockfile.yml b/.github/workflows/update-lockfile.yml
@@ -0,0 +1,57 @@
+---
+name: update-lockfile
+
+on:
+  workflow_dispatch:
+  # schedule:
+  # At 5:28am UTC Monday and Thursday
+  # - cron: 28 5 * * MON,THU
+
+jobs:
+  conda-lock:
+    # Don't run scheduled job on forks.
+    if: (github.event_name == 'schedule' && github.repository == 'catalyst-cooperative/pudl') || (github.event_name != 'schedule')
+    defaults:
+      run:
+        # Ensure the environment is activated
+        # <https://github.com/mamba-org/provision-with-micromamba#important>
+        shell: bash -l {0}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Micromamba
+        uses: mamba-org/setup-micromamba@v1
+        with:
+          environment-file: environments/conda-lock.yml
+          environment-name: pudl-dev
+
+      - name: Install pudl from branch
+        run: pip install --editable "./[dev,docs,test,datasette]"
+
+      - name: Run conda-lock to recreate lockfile from scratch
+        run: |
+          rm environments/conda-lock.yml
+          conda-lock \
+              --file=environments/dev-environment.yml \
+              --file=pyproject.toml \
+              --lockfile=environments/conda-lock.yml
+      - name: Open a pull request
+        uses: peter-evans/create-pull-request@v5
+        with:
+          # # The default GITHUB_TOKEN doesn't allow other workflows to trigger.
+          # # Thus if there are tests to be run, they won't be run. For more info,
+          # # see the note under
+          # # <https://github.com/peter-evans/create-pull-request#action-inputs>.
+          # # One possible workaround is to specify a Personal Access Token (PAT).
+          # # This PAT should have read-write permissions for "Pull Requests"
+          # # and read-write permissions for "Contents".
+          # token: ${{ secrets.GH_PAT_FOR_PR }}
+          commit-message: Update lockfile
+          title: Update Lockfile
+          body: >
+            This pull request relocks the dependencies with conda-lock.
+            It is triggered by [update-lockfile](https://github.com/catalyst-cooperative/pudl/blob/main/.github/workflows/update-lockfile.yml).
+          branch: update-lockfile
+          labels: dependencies, conda-lock
+          reviewers: zaneselvans
+          delete-branch: true
diff --git a/.gitignore b/.gitignore
@@ -38,3 +38,9 @@ notebooks/*.tgz
 terraform/.terraform/*
 .env
 .hypothesis/
+
+# generated by datasette/publish.py fresh for every deploy - we shouldn't track changes.
+devtools/datasette/fly/Dockerfile
+devtools/datasette/fly/inspect-data.json
+devtools/datasette/fly/metadata.yml
+devtools/datasette/fly/all_dbs.tar.zst
diff --git a/devtools/datasette/fly/fly.toml b/devtools/datasette/fly/fly.toml
@@ -0,0 +1,34 @@
+# fly.toml app configuration file generated for catalyst-coop-pudl on 2023-11-03T15:31:15-04:00
+#
+# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
+#
+app = "catalyst-coop-pudl"
+primary_region = "bos"
+
+[[mounts]]
+  destination = "/data"
+  source = "datasette"
+
+[[services]]
+  internal_port = 8080
+  protocol = "tcp"
+
+  [services.concurrency]
+    hard_limit = 25
+    soft_limit = 20
+
+  [[services.ports]]
+    handlers = ["http"]
+    port = 80
+
+  [[services.ports]]
+    handlers = ["tls", "http"]
+    port = 443
+
+  [[services.tcp_checks]]
+    grace_period = "1m"
+    interval = 10000
+    timeout = 2000
+
+[deploy]
+wait_timeout = "15m"
diff --git a/devtools/datasette/fly/run.sh b/devtools/datasette/fly/run.sh
@@ -0,0 +1,10 @@
+#! /usr/bin/env bash
+set -eux
+
+shopt -s nullglob
+
+find /data/ -name '*.sqlite' -delete
+mv all_dbs.tar.zst /data
+zstd -f -d /data/all_dbs.tar.zst -o /data/all_dbs.tar
+tar -xf /data/all_dbs.tar --directory /data
+datasette serve --host 0.0.0.0 /data/*.sqlite --cors --inspect-file inspect-data.json --metadata metadata.yml --setting sql_time_limit_ms 5000 --port $PORT
diff --git a/devtools/datasette/publish.py b/devtools/datasette/publish.py
@@ -0,0 +1,122 @@
+"""Publish the datasette to fly.io.
+
+We use custom logic here because the datasette-publish-fly plugin bakes the
+uncompressed databases into the image, which makes the image too large.
+
+We compress the databases before baking them into the image. Then we decompress
+them at runtime to a Fly volume mounted at /data. This avoids a long download
+at startup, and allows us to stay within the Fly.io 8GB image size limit.
+
+The volume handling is done manually outside of this publish.py script - it
+should be terraformed at some point.
+
+Some static fly.io deployment-related files live in ./fly:
+* fly.toml - service configuration
+* run.sh - service entrypoint
+
+Apart from that: the Dockerfile and dataset-specific
+metadata.yml/inspect-data.json are generated by this script.
+"""
+
+import json
+import logging
+import secrets
+from pathlib import Path
+from subprocess import check_call, check_output
+
+from pudl.metadata.classes import DatasetteMetadata
+from pudl.workspace.setup import PudlPaths
+
+logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
+
+DOCKERFILE_TEMPLATE = """
+FROM python:3.11.0-slim-bullseye
+COPY . /app
+WORKDIR /app
+
+RUN apt-get update
+RUN apt-get install -y zstd
+
+ENV DATASETTE_SECRET '{datasette_secret}'
+RUN pip install -U datasette datasette-cluster-map datasette-vega datasette-block-robots
+ENV PORT 8080
+EXPOSE 8080
+
+CMD ["./run.sh"]
+"""
+
+
+def make_dockerfile():
+    """Write a dockerfile from template, to use in fly deploy.
+
+    We write this from template so we can generate a datasette secret. This way
+    we don't have to manage secrets at all.
+    """
+    datasette_secret = secrets.token_hex(16)
+    return DOCKERFILE_TEMPLATE.format(datasette_secret=datasette_secret)
+
+
+def inspect_data(datasets, pudl_out):
+    """Pre-inspect databases to generate some metadata for Datasette.
+
+    This is done in the image build process in datasette-publish-fly, but since
+    we don't have access to the databases in the build process we have to
+    inspect before building the Docker image.
+    """
+    inspect_output = json.loads(
+        check_output(
+            [  # noqa: S603
+                "datasette",
+                "inspect",
+            ]
+            + [str(pudl_out / ds) for ds in datasets]
+        )
+    )
+
+    for dataset in inspect_output:
+        name = Path(inspect_output[dataset]["file"]).name
+        new_filepath = Path("/data") / name
+        inspect_output[dataset]["file"] = str(new_filepath)
+    return inspect_output
+
+
+def metadata(pudl_out) -> str:
+    """Return human-readable metadata for Datasette."""
+    return DatasetteMetadata.from_data_source_ids(pudl_out).to_yaml()
+
+
+def main():
+    """Generate deployment files and run the deploy."""
+    fly_dir = Path(__file__).parent.absolute() / "fly"
+    docker_path = fly_dir / "Dockerfile"
+    inspect_path = fly_dir / "inspect-data.json"
+    metadata_path = fly_dir / "metadata.yml"
+
+    pudl_out = PudlPaths().pudl_output
+    datasets = [str(p.name) for p in pudl_out.glob("*.sqlite")]
+    logging.info(f"Inspecting DBs for datasette: {datasets}...")
+    inspect_output = inspect_data(datasets, pudl_out)
+    with inspect_path.open("w") as f:
+        f.write(json.dumps(inspect_output))
+
+    logging.info("Writing metadata...")
+    with metadata_path.open("w") as f:
+        f.write(metadata(pudl_out))
+
+    logging.info("Writing Dockerfile...")
+    with docker_path.open("w") as f:
+        f.write(make_dockerfile())
+
+    logging.info(f"Compressing {datasets} and putting into docker context...")
+    check_call(
+        ["tar", "-a", "-czvf", fly_dir / "all_dbs.tar.zst"] + datasets,  # noqa: S603
+        cwd=pudl_out,
+    )
+
+    logging.info("Running fly deploy...")
+    check_call(["/usr/bin/env", "flyctl", "deploy"], cwd=fly_dir)  # noqa: S603
+    logging.info("Deploy finished!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/devtools/datasette/publish.sh b/devtools/datasette/publish.sh
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,5 +1,7 @@
 FROM condaforge/mambaforge:23.3.1-1
 
+SHELL [ "/bin/bash", "-exo", "pipefail", "-c" ]
+
 # Install curl and js
 # awscli requires unzip, less, groff and mandoc
 # hadolint ignore=DL3008
@@ -24,6 +26,10 @@ ENV CONTAINER_HOME=/home/catalyst
 USER catalyst
 WORKDIR ${CONTAINER_HOME}
 
+# Install flyctl
+RUN curl -L https://fly.io/install.sh | sh
+ENV PATH="${CONTAINER_HOME}/.fly/bin:$PATH"
+
 ENV CONDA_PREFIX=${CONTAINER_HOME}/env
 ENV PUDL_REPO=${CONTAINER_HOME}/pudl
 ENV CONDA_RUN="conda run --no-capture-output --prefix ${CONDA_PREFIX}"

diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -12,6 +12,7 @@ services:
     environment:
       - API_KEY_EIA
       - GCP_BILLING_PROJECT
+      - FLY_ACCESS_TOKEN
     env_file:
       - .env
     build:

diff --git a/docker/gcp_pudl_etl.sh b/docker/gcp_pudl_etl.sh
@@ -85,20 +85,28 @@ function notify_slack() {
 # 2>&1 redirects stderr to stdout.
 run_pudl_etl 2>&1 | tee $LOGFILE
+# Save the ETL exit status right away: every later command overwrites PIPESTATUS.
+PIPELINE_STATUS=${PIPESTATUS[0]}
 
-# Notify slack if the etl succeeded.
-if [[ ${PIPESTATUS[0]} == 0 ]]; then
-    notify_slack "success"
-
+# if pipeline is successful, distribute + publish datasette
+if [[ $PIPELINE_STATUS == 0 ]]; then
     # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
     if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
-        copy_outputs_to_distribution_bucket
+        copy_outputs_to_distribution_bucket || PIPELINE_STATUS=$?
     fi
 
     # Deploy the updated data to datasette
     if [ $GITHUB_REF = "dev" ]; then
-        gcloud config set run/region us-central1
-        source ~/devtools/datasette/publish.sh
+        python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
+        # tee's exit status would mask a failed deploy, so check publish.py's own status
+        [[ ${PIPESTATUS[0]} == 0 ]] || PIPELINE_STATUS=1
     fi
+fi
+
+# Notify slack about the entire pipeline's success or failure, using the status
+# accumulated in PIPELINE_STATUS (ETL run, distribution copy, datasette deploy)
+# rather than PIPESTATUS, which only reflects the most recently executed command
+if [[ $PIPELINE_STATUS == 0 ]]; then
+    notify_slack "success"
 else
     notify_slack "failure"
 fi

diff --git a/notebooks/work-in-progress/CEMS_by_utility.ipynb b/notebooks/work-in-progress/CEMS_by_utility.ipynb
@@ -47,7 +47,7 @@
     "from pudl.workspace.setup import PudlPaths\n",
     "\n",
     "\n",
-    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
+    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
     "\n",
     "pudl_engine = sa.create_engine(PudlPaths().pudl_db())\n",
     "#display(pudl_engine)\n",

diff --git a/notebooks/work-in-progress/better-heatrates.ipynb b/notebooks/work-in-progress/better-heatrates.ipynb
@@ -324,7 +324,7 @@
     "from pudl.workspace.setup import PudlPaths\n",
     "\n",
     "# TODO(janrous): provide property for accessing ferc db?\n",
-    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
+    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
     "pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
     "\n",
     "API_KEY_EIA = os.environ[\"API_KEY_EIA\"]\n",

diff --git a/notebooks/work-in-progress/ferc714-output.ipynb b/notebooks/work-in-progress/ferc714-output.ipynb
@@ -142,7 +142,7 @@
    "source": [
     "from pudl.workspace.setup import PudlPaths\n",
     "\n",
-    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
+    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
     "display(ferc1_engine)\n",
     "\n",
     "pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",

diff --git a/notebooks/work-in-progress/jupyterhub-test.ipynb b/notebooks/work-in-progress/jupyterhub-test.ipynb
@@ -51,7 +51,7 @@
    "source": [
     "from pudl.workspace.setup import PudlPaths\n",
     "\n",
-    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
+    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
     "pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
     "pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
    ]

diff --git a/notebooks/work-in-progress/state-demand.ipynb b/notebooks/work-in-progress/state-demand.ipynb
@@ -113,7 +113,7 @@
     "#HARVEST_ACCOUNT_ID = os.environ[\"HARVEST_ACCOUNT_ID\"]\n",
     "\n",
     "from pudl.workspace.setup import PudlPaths\n",
-    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db(\"ferc1\"))\n",
+    "ferc1_engine = sa.create_engine(PudlPaths().sqlite_db_uri(\"ferc1\"))\n",
     "pudl_engine = sa.create_engine(PudlPaths().pudl_db)\n",
     "pudl_out = pudl.output.pudltabl.PudlTabl(pudl_engine=pudl_engine)"
    ]

diff --git a/pyproject.toml b/pyproject.toml
@@ -100,7 +100,6 @@ keywords = [
 metadata_to_rst = "pudl.convert.metadata_to_rst:main"
 epacems_to_parquet = "pudl.convert.epacems_to_parquet:main"
 ferc_to_sqlite = "pudl.ferc_to_sqlite.cli:main"
-datasette_metadata_to_yml = "pudl.convert.datasette_metadata_to_yml:main"
 pudl_datastore = "pudl.workspace.datastore:main"
 pudl_etl = "pudl.cli.etl:main"
 pudl_setup = "pudl.workspace.setup_cli:main"