diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..37754a53
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,63 @@
+name: Build kernel images
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    # "closed" is not a default activity type; it is listed so the clear_cache
+    # job of the reusable workflow actually runs when a PR is closed or merged.
+    types: [opened, synchronize, reopened, closed]
+    branches:
+      - main
+
+jobs:
+  base-linting:
+    name: base-linting
+    # There is nothing new to lint when a PR is only being closed
+    if: ${{ github.event.action != 'closed' }}
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v3
+
+      - name: Lint Dockerfile, Shell scripts, YAML
+        uses: github/super-linter@v4
+        env:
+          # Must name the branch this workflow targets (see `on.push.branches`)
+          DEFAULT_BRANCH: main
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+          # Linters to enable
+          VALIDATE_BASH: true
+          VALIDATE_BASH_EXEC: true
+          VALIDATE_DOCKERFILE_HADOLINT: true
+          VALIDATE_YAML: true
+
+  build_python_kernel:
+    permissions:
+      id-token: write
+      contents: read
+      packages: write
+      actions: write
+    uses: ./.github/workflows/reusable-docker-build.yml
+    strategy:
+      matrix:
+        # Must be a supported version by jupyter/datascience-notebook
+        # https://hub.docker.com/r/jupyter/datascience-notebook/tags?page=1&name=python-
+        version: [ "3.9.13", "3.8.13" ]
+    secrets: inherit
+    with:
+      dockerfile: ./kernels/python/Dockerfile
+      context: ./kernels/python
+      images: |
+        ghcr.io/${{ github.repository }}/python
+      tags: |
+        type=ref,event=branch,prefix=${{ matrix.version }}
+        type=ref,event=pr,prefix=${{ matrix.version }}
+        type=sha,format=long,prefix=${{ matrix.version }}
+        type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }},prefix=${{ matrix.version }}
+      build_args: |
+        PYTHON_VERSION=${{ matrix.version }}
+      platforms: "linux/amd64"
diff --git a/.github/workflows/reusable-docker-build.yml b/.github/workflows/reusable-docker-build.yml
new file mode 100644
index 00000000..5958de5b
--- /dev/null
+++ b/.github/workflows/reusable-docker-build.yml
@@ -0,0 +1,145 @@
+name: docker
+
+on:
+  workflow_call:
+    inputs:
+      dockerfile:
+        description: "Path to the Dockerfile to build"
+        type: string
+        default: Dockerfile
+      context:
+        description: "The context for Docker build"
+        type: string
+        default: "."
+      platforms:
+        description: "Comma-separated list of platforms to build on"
+        type: string
+        required: false
+        default: "linux/amd64,linux/arm64"
+      images:
+        description: "The image names that we want to build"
+        type: string
+        required: true
+      tags:
+        description: "The various tags to be attached to the built image"
+        type: string
+        required: false
+        default: ""
+      labels:
+        description: "The various labels to attach to the built image"
+        type: string
+        required: false
+        default: |
+          org.opencontainers.image.url=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          org.opencontainers.image.vendor=Noteable
+          org.opencontainers.image.version=${{ github.ref }}
+      target:
+        description: "Sets the target stage to build"
+        type: string
+        required: false
+      build_args:
+        description: "Additional build args to pass to the Docker build"
+        type: string
+        required: false
+        default: ""
+    secrets:
+      # We login to Dockerhub to prevent rate limiting issues when pulling images
+      # https://docs.docker.com/docker-hub/download-rate-limit/
+      DOCKERHUB_USER:
+        required: true
+      DOCKERHUB_PASSWORD:
+        required: true
+
+jobs:
+  build:
+    permissions:
+      id-token: write
+      contents: read
+      packages: write
+
+    if: |
+      github.event_name == 'push' ||
+      (github.event_name == 'pull_request' && github.event.pull_request.state == 'open')
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout the code
+        uses: actions/checkout@v3
+
+      - name: Copy common files
+        run: make copy-common-files
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USER }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          version: v0.10.1
+
+      # Note: The outputs in github action will show duplicate labels being generated for the meta outputs.
+      # When the Docker engine builds, it will only take the later values, and our custom labels get added
+      # at the end. https://github.com/docker/metadata-action/issues/125
+      - name: Docker metadata for labels and tags
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ inputs.images }}
+          tags: ${{ inputs.tags }}
+          labels: ${{ inputs.labels }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          platforms: ${{ inputs.platforms }}
+          context: ${{ inputs.context }}
+          file: ${{ inputs.dockerfile }}
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          target: ${{ inputs.target }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          build-args: ${{ inputs.build_args }}
+
+  clear_cache:
+    permissions:
+      contents: read
+      actions: write
+    # If the PR is closed (or merged), we want to clear the cache
+    if: ${{ github.event_name == 'pull_request' && github.event.pull_request.state == 'closed' }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v3
+
+      - name: Cleanup
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          REPO="${{ github.repository }}"
+          # Name the PR merge ref explicitly so only this PR's caches are listed
+          BRANCH="refs/pull/${{ github.event.pull_request.number }}/merge"
+
+          echo "Fetching list of cache key"
+          cacheKeysForPR=$(gh actions-cache list -R "$REPO" -B "$BRANCH" | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+              gh actions-cache delete "$cacheKey" -R "$REPO" -B "$BRANCH" --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..25e95419
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,8 @@
+.PHONY: copy-common-files
+
+copy-common-files:
+	cp requirements.txt kernels/python/
+	cp ipython_config.py kernels/python/
+	cp secrets_helper.py kernels/python/
+	cp git_credential_helper.py kernels/python/
+	cp git-wrapper.sh kernels/python/
diff --git a/git-wrapper.sh b/git-wrapper.sh
new file mode 100755
index 00000000..4e2a505e
--- /dev/null
+++ b/git-wrapper.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# This script wraps git to only allow certain commands to be run.
+# We mainly want to prevent users from getting into unknown states by checking out other branches, etc.
+
+# Allowed command list
+allowed_commands=( "commit" "pull" "push" "status" "diff" "add" "fetch" "log" "version" )
+
+# Check if the command is allowed
+# shellcheck disable=SC2076
+if [[ ! " ${allowed_commands[*]} " =~ " ${1} " ]]; then
+    echo "That git command is not allowed, contact support@noteable.io if you think this is a mistake."
+    exit 1
+fi
+
+# Otherwise pass through to git at /usr/bin/git
+exec /usr/bin/git "$@"
diff --git a/git_credential_helper.py b/git_credential_helper.py
new file mode 100755
index 00000000..d989a54f
--- /dev/null
+++ b/git_credential_helper.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+This script is used as a Git credential helper https://git-scm.com/docs/git-credential.
+We iterate through all the git credential secrets on the file system and return the first one that matches the requested URL.
+If no match is found, we return an empty response.
+An empty response will cause Git to use the next credential helper in the list, or prompt the user for credentials.
+To test this script:
+$ cat > /tmp/demo.git-cred <<EOF
+{
+  "meta": {"host": "github.com", "protocol": "https", "path": "noteable-io/demo.git"},
+  "data": {"username": "demo-user", "password": "demo-token"}
+}
+EOF
+$ NTBL_SECRETS_DIR=/tmp ./git_credential_helper.py <<EOF
+protocol=https
+host=github.com
+path=noteable-io/demo.git
+EOF
+username=demo-user
+password=demo-token
+"""
+# NOTE(review): the usage example above and the import block below were rebuilt
+# from how the code uses them (the original span was lost) -- please confirm.
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+
+
+def parse_input(input_: str) -> dict:
+    """Parse the input from Git into a dictionary.
+
+    Lines without ``=`` (such as a blank line terminating the request) are
+    skipped instead of raising ``ValueError``.
+    """
+    return dict(line.split("=", 1) for line in input_.splitlines() if "=" in line)
+
+
+def format_output(data: dict) -> str:
+    """Format the output to Git."""
+    return "\n".join(f"{key}={value}" for key, value in data.items())
+
+
+def find_secret(input_data: dict) -> Optional[dict]:
+    """Find the secret that matches the input data."""
+    secrets_dir = Path(os.environ.get("NTBL_SECRETS_DIR", "/vault/secrets"))
+    if not secrets_dir.exists():
+        return None
+
+    keys_to_match = ["host", "protocol", "path"]
+    for secret_path in secrets_dir.glob("*.git-cred"):
+        secret_data = json.loads(secret_path.read_text())
+        meta = secret_data["meta"]
+        if all(meta[key] == input_data.get(key) for key in keys_to_match):
+            return secret_data["data"]
+
+    return None
+
+
+def main(stdin=sys.stdin, stdout=sys.stdout):
+    """Main entrypoint."""
+    parsed_input = parse_input(stdin.read())
+    if (secret := find_secret(parsed_input)) is not None:
+        print(format_output(secret), file=stdout)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/ipython_config.py b/ipython_config.py
new file mode 100644
index 00000000..fb66a186
--- /dev/null
+++ b/ipython_config.py
@@ -0,0 +1,11 @@
+c.InteractiveShellApp.extensions = [
+    "noteable_magics",
+]
+
+c.SqlMagic.feedback = False
+c.SqlMagic.autopandas = True
+c.NTBLMagic.project_dir = "/etc/noteable/project"
+c.NoteableDataLoaderMagic.return_head = False
+c.IPythonKernel._execute_sleep = 0.15
+# 10 minutes to support large files
+c.NTBLMagic.planar_ally_default_timeout_seconds = 600
\ No newline at end of file
diff --git a/kernels/python/.pythonrc b/kernels/python/.pythonrc
new file mode 100644
index 00000000..6a1eac85
--- /dev/null
+++ b/kernels/python/.pythonrc
@@ -0,0 +1,8 @@
+import pandas as pd
+
+import dx
+
+dx.set_option("DISPLAY_MAX_ROWS", 50_000)
+dx.set_option("DISPLAY_MAX_COLUMNS", 100)
+dx.set_option("ENABLE_DATALINK", True)
+dx.set_option("ENABLE_ASSIGNMENT", False)
\ No newline at end of file
diff --git a/kernels/python/Dockerfile b/kernels/python/Dockerfile
new file mode 100644
index 00000000..6ee0fda6
--- /dev/null
+++ b/kernels/python/Dockerfile
@@ -0,0 +1,102 @@
+# syntax = docker/dockerfile:1.2.1
+ARG PYTHON_VERSION
+FROM jupyter/datascience-notebook:python-${PYTHON_VERSION}
+
+USER root
+
+# datascience-notebook:python-3.9.13 includes psutil 5.9.2 with cooked C lib, but
+# later pip installs end up installing 5.9.4, but for some
+# reason 'import psutil' will end up getting the python 5.9.4 but the
+# C lib from 5.9.2, and, unlike Smeagol, it hateses the precious.
+RUN pip uninstall -y psutil
+
+# Set up log file for magics
+RUN touch /var/log/noteable_magics.log && \
+    chown 4004:4004 /var/log/noteable_magics.log
+
+# When image is run, run the code with the environment
+# activated:
+SHELL ["/bin/bash", "-c"]
+
+WORKDIR /tmp
+
+# hadolint ignore=DL3008,DL3015
+RUN apt-get update && \
+    apt-get install -y jq procps git unixodbc-dev g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV NB_USER="noteable" \
+    NB_UID=4004 \
+    NB_GID=4004
+
+# Create the default unprivileged user
+RUN groupadd --gid 4004 noteable && \
+    useradd --uid 4004 --shell /bin/false --create-home --no-log-init --gid noteable noteable && \
+    chown --recursive noteable:noteable /home/noteable
+
+RUN mkdir /etc/ipython && chown noteable:noteable /etc/ipython
+RUN mkdir -p /etc/noteable && chown noteable:noteable /etc/noteable
+
+RUN chown noteable:noteable "${JULIA_PKGDIR}" && \
+    chown noteable:noteable "${CONDA_DIR}" && \
+    fix-permissions "${JULIA_PKGDIR}" && \
+    fix-permissions "${CONDA_DIR}"
+
+# Run non-privileged user
+USER noteable
+
+ENV PATH="/home/noteable/.local/bin:${PATH}" \
+    HOME="/home/noteable" \
+    XDG_CACHE_HOME="/home/noteable/.cache/" \
+    GOOGLE_APPLICATION_CREDENTIALS="/vault/secrets/gcp-credentials"
+
+# hadolint ignore=DL3045
+COPY environment.txt ./
+
+# hadolint ignore=SC2034
+RUN mamba install --yes --file environment.txt
+
+# hadolint ignore=DL3045
+COPY requirements.txt ./
+
+# hadolint ignore=SC1008,SC2155,DL3042,SC2102
+RUN pip install -I --no-cache-dir -r requirements.txt
+
+# Copy over any python commands that need to run on startup
+# that aren't covered by IPython extensions
+COPY .pythonrc /home/noteable/.pythonrc
+
+# Enable the widgets nbextension
+# hadolint ignore=SC1008
+RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension
+
+# Smoke test to ensure packages were installed properly
+# hadolint ignore=SC1008
+RUN python -c "import noteable_magics, psutil"
+
+RUN git config --global user.name "Noteable Kernel" && \
+    git config --global user.email "engineering@noteable.io" && \
+    git config --global safe.directory /etc/noteable/project && \
+    git config --global credential.helper /git_credential_helper.py && \
+    git config --global credential.useHttpPath true
+
+# https://ipython.readthedocs.io/en/stable/config/intro.html#systemwide-configuration
+COPY ipython_config.py /etc/ipython
+
+# Set standard working directory for noteable project
+WORKDIR /etc/noteable/project
+
+# Add the entrypoint script to the $PATH
+COPY run.sh /usr/local/bin
+COPY secrets_helper.py /tmp/secrets_helper.py
+COPY git_credential_helper.py /git_credential_helper.py
+COPY git-wrapper.sh /usr/local/bin/git
+
+EXPOSE 50001-50005
+
+# Use tini to manage passing signals to the child kernel process
+# -g will ensure signals are passed to the entire child process *group*,
+# not just the immediate child process (bash)
+# https://github.com/krallin/tini#process-group-killing
+ENTRYPOINT ["tini", "-g", "--"]
+CMD ["run.sh"]
diff --git a/kernels/python/environment.txt b/kernels/python/environment.txt
new file mode 100644
index 00000000..20d65f2f
--- /dev/null
+++ b/kernels/python/environment.txt
@@ -0,0 +1,7 @@
+jupyter_client=7.3.*
+ipython=8.0.*
+vdom=0.6
+papermill=2.2.*
+ipywidgets=7.6.*
+plotly=4.14.3
+geopandas=0.11.0
\ No newline at end of file
diff --git a/kernels/python/run.sh b/kernels/python/run.sh
new file mode 100755
index 00000000..6fdb4434
--- /dev/null
+++ b/kernels/python/run.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -o pipefail
+set -o nounset
+set -o errexit
+
+echo "Local time: $(date)"
+
+set -x
+
+connection_file=/tmp/connection_file.json
+
+cp /etc/noteable/connections/connection_file.json "${connection_file}"
+
+kernel_name=$(jq -r .kernel_name "${connection_file}")
+
+# Inject Secrets into environment (see script docstring for more info)
+# set +x to avoid echoing the Secrets in plaintext to logs
+set +x
+echo "Injecting Secrets into environment, echoing is turned off"
+eval "$(python /tmp/secrets_helper.py)"
+echo "Done injecting Secrets, turning echoing back on"
+set -x
+
+case "$kernel_name" in
+
+  python | python3)
+    echo "Starting Python kernel"
+    # https://docs.python.org/3/using/cmdline.html#envvar-PYTHONSTARTUP
+    export PYTHONSTARTUP=~/.pythonrc
+    exec python -m ipykernel_launcher -f "${connection_file}" --debug
+    ;;
+
+  ir)
+    echo "Starting R kernel"
+    exec R --slave -e "IRkernel::main()" --args "${connection_file}"
+    ;;
+
+  julia | julia-1.6)
+    echo "Starting Julia kernel"
+    # project path necessary to keep julia from using its defaults
+    exec julia -i --color=yes --project=/etc/noteable/project /opt/julia/packages/IJulia/e8kqU/src/kernel.jl "${connection_file}"
+    ;;
+
+  *)
+    echo "Unrecognized '$kernel_name' kernel, falling back to Python"
+    # https://docs.python.org/3/using/cmdline.html#envvar-PYTHONSTARTUP
+    export PYTHONSTARTUP=~/.pythonrc
+    exec python -m ipykernel_launcher -f "${connection_file}" --debug
+    ;;
+esac
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..110ef814
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+# Scheduler/orchestration packages
+dagstermill==0.16.15
+papermill-origami==0.0.9
+cloudpickle==2.2.0
+flytekitplugins-papermill==1.2.4
+
+# https://github.com/noteable-io/ packages
+git+https://www.github.com/noteable-io/dx.git@a7df2821182293546d7d7a9ede3cdcc0c946d570
+git+https://www.github.com/noteable-io/noteable-notebook-magics.git@a6a6801da8ea7ccb72b9f354d4780699c3d99f73
+git+https://www.github.com/noteable-io/sidecar_comms.git@6ee04efe60b855c465727f120f8f50a7bfa60097
+
+# (All of the datasources modules are now explicit requirements within noteable-notebook-magics.)
\ No newline at end of file
diff --git a/secrets_helper.py b/secrets_helper.py
new file mode 100644
index 00000000..a832841a
--- /dev/null
+++ b/secrets_helper.py
@@ -0,0 +1,40 @@
+"""
+This script helps inject Secrets into the Kernel environment.
+The Vault Agent will volume mount files into the Kernel container
+at /vault/secrets. Noteable Secrets will be in .env suffix files.
+We want to parse all those files and export them as environment variables
+in the bash script that kicks off the Kernel (ipykernel_launcher etc).
+Doing that scripting in bash is a pain, so we do it in Python here and
+bash just does an `eval` on the output.
+Some defensive programming to highlight:
+ - Env vars in the output are all uppercased
+ - If an env var is already set, we don't overwrite it
+ - We use shlex to quote the output so bash eval does not cause nasty side effects
+ - Files whose name is not a valid shell variable name are skipped (with a warning on stderr)
+"""
+import os
+import pathlib
+import re
+import shlex
+import sys
+
+# Names bash accepts in `export NAME=value`. The name is emitted unquoted, so anything
+# else would either make the `eval` fail or smuggle shell syntax in through the file name.
+VALID_NAME = re.compile(r"[A-Z_][A-Z0-9_]*")
+
+output = []
+
+secrets_directory = os.environ.get("VAULT_SECRETS_PATH", "/vault/secrets")
+
+directory = pathlib.Path(secrets_directory)
+if directory.exists():
+    for file in directory.glob("*.env"):
+        name = file.stem.upper()
+        if VALID_NAME.fullmatch(name) is None:
+            print(f"Skipping {file.name}: not a valid environment variable name", file=sys.stderr)
+            continue
+        if name not in os.environ:
+            content = file.read_text()
+            output.append(f"export {name}={shlex.quote(content)}")
+
+print("\n".join(output))
\ No newline at end of file