diff --git a/.github/workflows/test_update_api_database.yml b/.github/workflows/test_update_api_database.yml
new file mode 100644
index 0000000..7e04a86
--- /dev/null
+++ b/.github/workflows/test_update_api_database.yml
@@ -0,0 +1,96 @@
+on: [push]
+
+jobs:
+  etl:
+    runs-on: ubuntu-latest
+    services:
+      postgres:
+        image: postgis/postgis:15-3.4-alpine
+        env:
+          POSTGRES_PASSWORD: postgres
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          # Maps tcp port 5432 on service container to the host
+          - 5432:5432
+    steps:
+      - name: check out repo code
+        uses: actions/checkout@v4
+      - name: Load Secrets
+        uses: 1password/load-secrets-action@v1
+        with:
+          export-env: true
+        env:
+          OP_SERVICE_ACCOUNT_TOKEN: ${{ secrets.OP_SERVICE_ACCOUNT_TOKEN }}
+          DO_SPACES_ENDPOINT: "op://AE Data Flow/Digital Ocean - S3 file storage/DO_SPACES_ENDPOINT"
+          DO_SPACES_ACCESS_KEY: "op://AE Data Flow/Digital Ocean - S3 file storage/DO_SPACES_ACCESS_KEY"
+          DO_SPACES_SECRET_KEY: "op://AE Data Flow/Digital Ocean - S3 file storage/DO_SPACES_SECRET_KEY"
+          DO_SPACES_BUCKET_DISTRIBUTIONS: "op://AE Data Flow/Digital Ocean - S3 file storage/DO_SPACES_BUCKET_DISTRIBUTIONS"
+          DO_ZONING_API_DB_HOST: "op://AE Data Flow/Digital Ocean DB Cluster - Zoning API/host"
+          DO_ZONING_API_DB_PORT: "op://AE Data Flow/Digital Ocean DB Cluster - Zoning API/port"
+          DO_ZONING_API_DB_USERNAME_DEV: "op://AE Data Flow/Digital Ocean DB Cluster - Zoning API dev/username"
+          DO_ZONING_API_DB_PASSWORD_DEV: "op://AE Data Flow/Digital Ocean DB Cluster - Zoning API dev/password"
+          DO_ZONING_API_DB_DATABASE_DEV: "op://AE Data Flow/Digital Ocean DB Cluster - Zoning API dev/database"
+      - name: Set .env file
+        run: |
+          echo "BUILD_ENGINE_HOST=127.0.0.1" >> .env
+          echo "BUILD_ENGINE_PORT=5432" >> .env
+          echo "BUILD_ENGINE_USER=postgres" >> .env
+          echo "BUILD_ENGINE_PASSWORD=postgres" >> .env
+          echo "BUILD_ENGINE_DB=postgres" >> .env
+          echo "DO_SPACES_ENDPOINT=$DO_SPACES_ENDPOINT" >> .env
+          echo "DO_SPACES_ACCESS_KEY=$DO_SPACES_ACCESS_KEY" >> .env
+          echo "DO_SPACES_SECRET_KEY=$DO_SPACES_SECRET_KEY" >> .env
+          echo "DO_SPACES_BUCKET_DISTRIBUTIONS=$DO_SPACES_BUCKET_DISTRIBUTIONS" >> .env
+          echo "ZONING_API_HOST=$DO_ZONING_API_DB_HOST" >> .env
+          echo "ZONING_API_PORT=$DO_ZONING_API_DB_PORT" >> .env
+          echo "ZONING_API_USER=$DO_ZONING_API_DB_USERNAME_DEV" >> .env
+          echo "ZONING_API_PASSWORD=$DO_ZONING_API_DB_PASSWORD_DEV" >> .env
+          echo "ZONING_API_DB=$DO_ZONING_API_DB_DATABASE_DEV" >> .env
+
+      - name: Install prerequisite packages
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y wget
+          sudo apt-get install -y git
+
+      - name: Setup PostgreSQL
+        uses: tj-actions/install-postgresql@v3
+        with:
+          postgresql-version: 15
+
+      - name: Check postgres install
+        run: pg_dump --version
+
+      - name: Install minio client
+        run: |
+          sudo wget https://dl.min.io/client/mc/release/linux-amd64/mc
+          sudo chmod +x mc
+          sudo mv mc /usr/local/bin
+
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+
+      - name: Install python dependencies
+        run: pip install -r requirements.txt
+
+      - name: Install dbt dependencies
+        run: dbt deps
+
+      - name: Download
+        run: ./bash/download.sh
+
+      - name: Import
+        run: ./bash/import.sh
+
+      - name: Transform
+        run: ./bash/transform.sh
+
+      - name: Export
+        run: ./bash/export.sh
+
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..21de767
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,48 @@
+FROM ubuntu:latest
+
+RUN apt-get update
+
+# RUN apt install -y wget gpg gnupg2 software-properties-common apt-transport-https lsb-release ca-certificates
+RUN apt-get install -y wget
+RUN apt-get install -y software-properties-common
+
+# psql from postgres-client
+RUN sh -c 'echo "deb https://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
+RUN wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
+RUN apt-get update
+RUN apt-get install -y postgresql-client-15
+
+
+# minio client
+RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc
+RUN chmod +x mc
+RUN mv mc /usr/local/bin
+
+# python
+COPY requirements.txt /requirements.txt
+RUN apt-get install -y python3 python3-pip
+RUN pip install -r requirements.txt
+
+# dbt
+## config
+COPY dbt_project.yml /dbt_project.yml
+COPY package-lock.yml /package-lock.yml
+COPY packages.yml /packages.yml
+COPY profiles.yml /profiles.yml
+## install
+RUN apt-get install -y git
+RUN dbt deps
+## tests
+COPY tests /tests
+
+# etl
+## scripts
+COPY bash ./bash
+## commands
+COPY sql /sql
+## local source files
+COPY borough.csv /borough.csv
+COPY land_use.csv /land_use.csv
+COPY zoning_district_class.csv /zoning_district_class.csv
+
+CMD ["sleep", "infinity"]
diff --git a/README.md b/README.md
index 4e65849..8463670 100644
--- a/README.md
+++ b/README.md
@@ -5,25 +5,23 @@ This is the primary repository for the data pipelines of the Application Enginee
 These pipelines are used to populate the databases used by our APIs and are called "data flows".
 
 ## Design
+For all AE data flows, there is an ephemeral database within a dockerized runner.
 
-For all AE data flows, there is one database cluster with a `staging` and a `prod` database. There are also `dev` databases. These are called data flow databases.
-
-For each API, there is a database cluster with a `staging` and a `prod` database. The only tables in those databases are those that an API uses. These are called API databases.
+For each API, there is a database cluster with a `data-qa` and a `prod` database. The only tables in those databases are those that an API uses. These are called API databases.
 
 For each API and the relevant databases, this is the approach to updating data:
-
-1. Load source data into the data flow database
+1. Load source data into the data flow ephemeral database
 2. Create tables that are identical in structure to the API database tables
 3. Replace the rows in the API database tables
 
-These steps are first performed on the `staging` sets of databases. When that process has succeeded and the API's use of it has passed QA, the same process is performed on the `prod` set of databases.
+The exact data flow steps are refined while working in a `local` docker environment. After the steps are stable, they are merged into `main`. From there, they are run first against a `data-qa` API database from within the `data-flow` GitHub Action. After passing quality checks, the `data-flow` GitHub Action is targeted against the `prod` API database.
 
 This is a more granular description of those steps:
-
 1. Download CSV files from Digital Ocean file storage
 2. Copy CSV files into source data tables
 3. Test source data tables
-4. Create API tables in the data flow database
+4. Create API tables in the data flow ephemeral database
 5. Populate the API tables in data flow database
 6. Replace rows in API tables in the API database
 
@@ -41,82 +39,15 @@ We use a github action to perform API database updates.
 
 We have three [environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) to configure the databases and credentials used for an API database update.
 
-The `dev` environment can used on any branch. The `staging` and `production` environments can only be used on the `main` branch.
+The `dev` environment can be used on any branch. The `data-qa` and `production` environments can only be used on the `main` branch.
 
 When an action attempts to use the `production` environment, specific people or teams specified in this repo's settings must approve the action run's access of environment.
 
 ## Local setup
 
-### Setup MiniO for S3 file transfers
-
 > [!NOTE]
-> These instructions are for local setup on macOS.
-
-For non-public files like our CSVs in `/edm/distribution/`, we can use [minio](https://github.com/minio/minio) for authenticated file transfers.
-
-#### Install
-
-```bash
-brew install minio/stable/mc
-```
-
-#### Add DO Spaces to the `mc` configuration
-
-```bash
-mc alias set spaces $DO_SPACES_ENDPOINT $DO_SPACES_ACCESS_KEY $DO_SPACES_SECRET_KEY
-```
-
-We use `spaces` here but you can name the alias anything. When you run `mc config host list` you should see the newly added host with credentials from your `.env`.
-
-### Setup python virtual environment
-
-> [!NOTE]
-> These instructions are for use of [pyenv](https://github.com/pyenv/pyenv) to manage python virtual environments. See [these instructions](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) to install it.
->
-> If you are using a different approach like [venv](https://docs.python.org/3/library/venv.html) or [virtualenv](https://virtualenv.pypa.io/en/latest/), follow comparable instructions in the relevant docs.
-
-The `.python-version` file defines which version of python this project uses.
-
-#### Install
-
-```bash
-brew install pyenv
-brew install pyenv-virtualenv
-```
-
-#### Create a virtual environment named `venv_ae_data_flow`
-
-```bash
-pyenv virtualenv venv_ae_data_flow
-pyenv virtualenvs
-```
-
-#### Activate `venv_ae_data_flow` in the current terminal
-
-```bash
-pyenv activate venv_ae_data_flow
-pyenv version
-```
-
-#### Install dependencies
-
-```bash
-python3 -m pip install --force-reinstall -r requirements.txt
-pip list
-dbt deps
-```
-
-### Setup postgres
-
-We use `postgres` version 15 in order to use the `psql` CLI.
-
-```bash
-brew install postgresql@15
-# Restart the terminal
-psql --version
-```
-
-## Local usage
+> These instructions depend on docker and docker compose.
+> If you need to install docker compose, follow [these instructions](https://docs.docker.com/compose/install/).
 
 ### Set environment variables
 
@@ -124,29 +55,24 @@ Create a file called `.env` in the root folder of the project and copy the conte
 
 Next, fill in the blank values.
 
-> [!IMPORTANT]
-> To use a local database, `sample_local.env` likely has the environment variable values you need.
->
-> To use a deployed database in Digital Ocean, the values you need can be found in the AE 1password vault.
+### Run the local zoning api database
+The `data-flow` steps are run against the `zoning-api` database. Locally, this relies on the two containers running on the same docker network. The zoning-api creates the network, which the data-flow then joins.
+Before continuing with the `data-flow` setup, follow the steps within `nycplanning/ae-zoning-api` to get its database running in a container.
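+
+As a quick sanity check (a minimal sketch; the network name `ae-zoning-api_data` matches the external network declared in this repo's `compose.yml`, assuming the `ae-zoning-api` compose project has already created it), you can confirm the shared network exists before starting the `data-flow` containers:
+
+```bash
+# List docker networks and confirm the network created by ae-zoning-api is present
+docker network ls --filter name=ae-zoning-api_data
+```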
 
-### Run local database with docker compose
-
-Next, use [docker compose](https://docs.docker.com/compose/) to stand up a local PostGIS database.
+### Run data-flow local database with docker compose
 
 ```bash
 ./bash/utils/setup_local_db.sh
 ```
 
-If you need to install docker compose, follow [these instructions](https://docs.docker.com/compose/install/).
-
-### Run each step
+### Run each step to complete the data flow
 
 ```bash
-./bash/download.sh
-./bash/import.sh
-./bash/transform.sh
-./bash/export.sh
-./bash/update_api_db.sh
+docker compose exec data-flow bash ./bash/download.sh
+docker compose exec data-flow bash ./bash/import.sh
+docker compose exec data-flow bash ./bash/transform.sh
+docker compose exec data-flow bash ./bash/export.sh
+docker compose exec data-flow bash ./bash/update_api_db.sh
 ```
 
 If you receive an error, make sure the script has the correct permissions:
diff --git a/bash/download.sh b/bash/download.sh
index 0fe8543..2797d83 100755
--- a/bash/download.sh
+++ b/bash/download.sh
@@ -11,6 +11,9 @@ source $ROOT_DIR/bash/utils/set_environment_variables.sh
 
 # Setting Environmental Variables
 set_envars
+# set alias
+mc alias set spaces $DO_SPACES_ENDPOINT $DO_SPACES_ACCESS_KEY $DO_SPACES_SECRET_KEY
+
 # Download CSV files from Digital Ocean file storage
 DATA_DIRECTORY=.data/
 mkdir -p ${DATA_DIRECTORY} && (
diff --git a/bash/utils/setup_local_db.sh b/bash/utils/setup_local_db.sh
deleted file mode 100755
index b978a39..0000000
--- a/bash/utils/setup_local_db.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Exit when any command fails
-set -e
-
-UTILS_DIR=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")
-ROOT_DIR=$(dirname ${UTILS_DIR})/..
-
-source ${ROOT_DIR}/bash/utils/set_environment_variables.sh
-
-# Setting Environmental Variables
-set_envars
-
-# Delete any artifacts of a local Data Flow database
-running_container_id=$(docker ps --all --quiet --filter name=${BUILD_ENGINE_CONTAINER_NAME})
-if [ -n "$running_container_id" ]
-then
-    echo "Stoping and deleting docker container ID ${running_container_id} ..."
-    docker stop $running_container_id
-    docker rm --force $running_container_id
-else
-    echo "Container ${BUILD_ENGINE_CONTAINER_NAME} not running"
-fi
-
-echo "Deleting database volume directory /db-volume ..."
-rm -rf ${ROOT_DIR}/db-volume
-
-# Create local Data Flow database
-echo "Creating docker container ${BUILD_ENGINE_CONTAINER_NAME} ..."
-docker compose up --detach
diff --git a/compose.yml b/compose.yml
index 74fe055..44bf33e 100644
--- a/compose.yml
+++ b/compose.yml
@@ -1,13 +1,34 @@
 services:
   db:
-    container_name: ${BUILD_ENGINE_CONTAINER_NAME}
     build:
-      context: ./db
+      context: db/.
     environment:
       - POSTGRES_USER=${BUILD_ENGINE_USER}
       - POSTGRES_PASSWORD=${BUILD_ENGINE_PASSWORD}
       - POSTGRES_DB=${BUILD_ENGINE_DB}
+    networks:
+      - data
     ports:
       - "8001:5432"
+  runner:
+    build:
+      context: .
+    env_file:
+      - .env
+    networks:
+      - data
+  data-flow:
+    build:
+      context: .
+    env_file:
+      - .env
+    networks:
+      - data
     volumes:
-      - ./db-volume:/var/lib/postgresql/data
+      - ./tests:/tests
+      - ./bash:/bash
+      - ./sql:/sql
+networks:
+  data:
+    name: ae-zoning-api_data
+    external: true
diff --git a/db/Dockerfile b/db/Dockerfile
index fcce1e9..60d6b7a 100644
--- a/db/Dockerfile
+++ b/db/Dockerfile
@@ -1,4 +1,4 @@
-FROM postgis/postgis:15-3.4
+FROM postgres:15-bookworm
 
 RUN apt update
 RUN apt install -y postgresql-15-postgis-3
diff --git a/diagrams/infrastructure_api_data_flow.drawio.png b/diagrams/infrastructure_api_data_flow.drawio.png
index ca812c2..129111a 100644
Binary files a/diagrams/infrastructure_api_data_flow.drawio.png and b/diagrams/infrastructure_api_data_flow.drawio.png differ
diff --git a/sample.env b/sample.env
index bade0f2..bfa400d 100644
--- a/sample.env
+++ b/sample.env
@@ -1,17 +1,16 @@
-BUILD_ENGINE_CONTAINER_NAME=ae-data-flow-database
-BUILD_ENGINE_HOST=
-BUILD_ENGINE_PORT=
-BUILD_ENGINE_USER=
-BUILD_ENGINE_PASSWORD=
-BUILD_ENGINE_DB=
+BUILD_ENGINE_HOST=ae-data-flow-db-1
+BUILD_ENGINE_PORT=5432
+BUILD_ENGINE_USER=postgres
+BUILD_ENGINE_PASSWORD=postgres
+BUILD_ENGINE_DB=data-flow
 
 DO_SPACES_ENDPOINT=
 DO_SPACES_ACCESS_KEY=
 DO_SPACES_SECRET_KEY=
 DO_SPACES_BUCKET_DISTRIBUTIONS=edm-distributions
 
-ZONING_API_HOST=
-ZONING_API_PORT=
-ZONING_API_USER=
-ZONING_API_PASSWORD=
-ZONING_API_DB=
\ No newline at end of file
+ZONING_API_HOST=ae-zoning-api-db-1
+ZONING_API_PORT=5432
+ZONING_API_USER=postgres
+ZONING_API_PASSWORD=postgres
+ZONING_API_DB=zoning
diff --git a/sample_local.env b/sample_local.env
deleted file mode 100644
index 59b6c7a..0000000
--- a/sample_local.env
+++ /dev/null
@@ -1,17 +0,0 @@
-BUILD_ENGINE_CONTAINER_NAME=ae-data-flow-database
-BUILD_ENGINE_HOST=localhost
-BUILD_ENGINE_PORT=8001
-BUILD_ENGINE_USER=postgres
-BUILD_ENGINE_PASSWORD=postgres
-BUILD_ENGINE_DB=data-flow
-
-DO_SPACES_ENDPOINT=
-DO_SPACES_ACCESS_KEY=
-DO_SPACES_SECRET_KEY=
-DO_SPACES_BUCKET_DISTRIBUTIONS=edm-distributions
-
-ZONING_API_HOST=localhost
-ZONING_API_PORT=8010
-ZONING_API_USER=postgres
-ZONING_API_PASSWORD=postgres
-ZONING_API_DB=zoning
diff --git a/sql/load_sources.sql b/sql/load_sources.sql
index ee6d604..bf1288f 100644
--- a/sql/load_sources.sql
+++ b/sql/load_sources.sql
@@ -1,3 +1,5 @@
+CREATE EXTENSION IF NOT EXISTS postgis;
+
 DROP TABLE IF EXISTS source_pluto;
 DROP INDEX IF EXISTS pluto_geom_idx;
 CREATE TABLE IF NOT EXISTS "source_pluto" (