diff --git a/.github/actions/docker-build-and-push/action.yml b/.github/actions/docker-build-and-push/action.yml
new file mode 100644
index 0000000..2e94251
--- /dev/null
+++ b/.github/actions/docker-build-and-push/action.yml
@@ -0,0 +1,53 @@
+name: Docker Build & Push
+description: Composite GitHub Action to build and push Docker images to the DLCS GitHub Packages repositories.
+
+inputs:
+  image-name:
+    description: "Name of the image to push to the GHCR repository."
+    required: true
+  dockerfile:
+    description: "The Dockerfile to build and push."
+    required: true
+  context:
+    description: "The context to use when building the Dockerfile."
+    required: true
+  github-token:
+    description: "The GitHub token used when interacting with GHCR."
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - id: checkout
+      uses: actions/checkout@v2
+    - id: docker-setup-buildx
+      uses: docker/setup-buildx-action@v2
+      with:
+        driver-opts: |
+          image=moby/buildkit:v0.10.6
+    - id: docker-meta
+      uses: docker/metadata-action@v4
+      with:
+        images: ghcr.io/dlcs/${{ inputs.image-name }}
+        tags: |
+          type=ref,event=branch
+          type=ref,event=pr
+          type=sha,enable=true,prefix=,format=long
+          type=semver,pattern={{version}}
+          type=semver,pattern={{major}}.{{minor}}
+          type=semver,pattern={{major}}
+    - id: docker-login
+      uses: docker/login-action@v2
+      with:
+        registry: ghcr.io
+        username: ${{ github.actor }}
+        password: ${{ inputs.github-token }}
+    - id: docker-build-push
+      uses: docker/build-push-action@v4
+      with:
+        context: ${{ inputs.context }}
+        file: ${{ inputs.dockerfile }}
+        builder: ${{ steps.docker-setup-buildx.outputs.name }}
+        tags: ${{ steps.docker-meta.outputs.tags }}
+        labels: ${{ steps.docker-meta.outputs.labels }}
+        push: ${{ github.actor != 'dependabot[bot]' }}
diff --git a/.github/workflows/build-deploy.yml b/.github/workflows/build-deploy.yml
index 9c3bd76..3c5d81d 100644
--- a/.github/workflows/build-deploy.yml
+++ b/.github/workflows/build-deploy.yml
@@ -11,45 +11,26 @@ on:
       - master
 
 jobs:
-  build-push:
+  build-push-varnish:
     runs-on: ubuntu-latest
-    steps:
-      - name: Check out code
-        id: checkout
-        uses: actions/checkout@v2
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v2
-
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v2
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./.github/actions/docker-build-and-push
+        name: build and push
         with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          image-name: "dlcs-varnish"
+          dockerfile: "Dockerfile"
+          context: "."
+          github-token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Docker meta
-        id: docker_meta
-        uses: docker/metadata-action@v4
-        with:
-          images: ghcr.io/dlcs/dlcs-varnish
-          tags: |
-            type=ref,event=branch
-            type=ref,event=pr
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=sha,enable=true,prefix=,format=long
-
-      - name: Build and push
-        id: docker_build
-        uses: docker/build-push-action@v4
+  build-push-dlcs-varnish-cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./.github/actions/docker-build-and-push
         with:
-          context: .
- builder: ${{ steps.buildx.outputs.name }} - push: true - labels: ${{ steps.docker_meta.outputs.labels }} - tags: ${{ steps.docker_meta.outputs.tags }} - - \ No newline at end of file + image-name: "dlcs-varnish-cleanup" + dockerfile: "varnish-cleanup/Dockerfile" + context: "./varnish-cleanup" + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d4a2f95 --- /dev/null +++ b/.gitignore @@ -0,0 +1,165 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +Scripts/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+# General
+*.exe
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index d3cefb4..2fca67d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,6 +10,13 @@ RUN pip install awscli
 COPY start.sh /start.sh
 RUN chmod +x /start.sh
 
+WORKDIR /usr/app/src
+COPY varnish-cleanup/requirements.txt ./
+RUN pip install -r requirements.txt
+
+COPY varnish-cleanup/cleanup_handler.py ./
+COPY varnish-cleanup/app ./app
+
 ENV VARNISH_PORT 80
 
 EXPOSE 80
diff --git a/README.md b/README.md
index 94b4dd9..7dfcae4 100644
--- a/README.md
+++ b/README.md
@@ -2,17 +2,25 @@
 
 Docker image using disk-backed Varnish instance for caching.
 
-On startup it uses the AWS CLI to copy vcl file from location specified by `S3_VCL_FILE` environment variable.
+On startup it uses the AWS CLI to copy the VCL file from the location specified by the `S3_VCL_FILE` environment variable.
+Optionally, a local VCL file can be used instead for development purposes.
 
 ## Configuration
 
-The following environment files are expected:
-
+The following environment settings are expected:
 * `S3_VCL_FILE` - The location of a vcl file to use. Expected S3Uri as it is used by [aws s3 cp](https://docs.aws.amazon.com/cli/latest/reference/s3/cp.html) command.
 * `VARNISH_CACHE_FOLDER` - Folder where disk backed cache is stored.
 * `VARNISH_CACHE_SIZE` - Size of cache.
-* `VARNISH_PORT` - Which port Varnish should listen on (defaults to 80)
+* `AWS_PROFILE` - Required to run locally
+* `INCOMING_QUEUE` - The name of the queue that the cleanup handler listens to
+
+The following configuration is optional:
+* `VARNISH_ADDRESS` - The location of varnish used by the cleanup handler. Defaults to localhost
+* `AWS_REGION` - The AWS region. Defaults to eu-west-1
+* `USE_LOCAL_CONFIG` - Whether to use a local config file over S3.
+
+*NOTE:* using `USE_LOCAL_CONFIG` requires a `mount` to be added to the `docker run` command containing the VCL.
 
 ## Running
 
 ```bash
@@ -21,10 +29,34 @@ docker build -t dlcs-varnish:local .
 
 # run
 docker run -it --rm \
-  --env AWS_ACCESS_KEY_ID='xxx' \
-  --env AWS_SECRET_ACCESS_KEY='xxx' \
   --env S3_VCL_FILE='s3://my-bucket/varnish-config.vcl' \
   --env VARNISH_CACHE_FOLDER='/path/to/folder' \
-  --env VARNISH_CACHE_SIZE='100M'
+  --env VARNISH_CACHE_SIZE='100M' \
+  --env-file='/path/to/env' \
+  {REQUIRED FOR LOCAL RUNNING}--volume $HOME/.aws/credentials:/root/.aws/credentials:ro \
+  {OPTIONAL}--mount type=bind,source=$(pwd)/etc/default.vcl,target=/mnt/varnish/default.vcl \
+  dlcs-varnish:local
+```
+# varnish-cleanup
+
+Additionally, there is a standalone docker container for the cleanup handler.
+
+## Configuration
+
+Required:
+* `AWS_PROFILE` - Required to run locally
+
+Optional:
+* `VARNISH_ADDRESS` - The location of varnish used by the cleanup handler. Defaults to localhost
+* `AWS_REGION` - The region used by the cleanup handler. Defaults to eu-west-1
+
+```bash
+# build
+docker build -t dlcs-varnish-cleanup:local ./varnish-cleanup
+
+# run
+docker run -it --rm \
+  --env-file='/path/to/env' \
+  {REQUIRED FOR LOCAL RUNNING}--volume=$HOME/.aws/credentials:/root/.aws/credentials:ro dlcs-varnish-cleanup:local
 
 ```
\ No newline at end of file
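For reviewers who want to exercise the cleanup flow by hand, the BAN request the handler issues against `VARNISH_ADDRESS` can be reproduced in a few lines. This is a minimal sketch mirroring `cleanup_handler._handle_message`, assuming Varnish is reachable on the default `http://localhost` and using the `26/18/54378677` asset id from the test fixture:

```python
import requests

VARNISH_ADDRESS = "http://localhost"  # default used by the cleanup handler
asset_id = "26/18/54378677"           # customer/space/asset, as in Test/Json/FullRequest.json

# The VCL in etc/default.vcl intercepts BAN requests in vcl_recv and converts the
# request URL into a ban on obj.http.x-asset-id, returning a synthetic 200.
response = requests.request("BAN", url=f"{VARNISH_ADDRESS}/{asset_id}")
print(response.status_code, response.text)  # expect 200 "Ban added" once the VCL is loaded
```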
diff --git a/etc/default.vcl b/etc/default.vcl
index 1c86f61..24f7616 100644
--- a/etc/default.vcl
+++ b/etc/default.vcl
@@ -1,13 +1,26 @@
+vcl 4.1;
+
 # See: https://www.varnish-software.com/developers/tutorials/#vcl
 backend default {
-    .host = "${BACKEND_HOST}";
-    .port = "${BACKEND_PORT}";
+    .host = "127.0.0.1";
+    .port = "80";
 }
 
 sub vcl_recv {
-    set req.backend = default;
-    return(lookup);
+    set req.backend_hint = default;
+
+    if (req.method == "BAN") {
+
+        set req.url = regsub(req.url, "^(\/)", "");
+
+        ban("obj.http.x-asset-id == " + req.url);
+
+        # Throw a synthetic page so the
+        # request won't go to the backend.
+        return(synth(200, "Ban added"));
+    }
+
+    return(hash);
 }
 
 sub vcl_miss {
@@ -18,7 +31,7 @@ sub vcl_hit {
     return(deliver);
 }
 
-sub vcl_fetch {
+sub vcl_backend_response {
     # Get the response. Set the cache lifetime of the response to 1 hour.
     set beresp.ttl = 1h;
 
@@ -30,5 +43,15 @@
 }
 
 sub vcl_deliver {
+    # Add debug header to see if it's a HIT/MISS and the number of hits, disable when not needed
+    if (obj.hits > 0) {
+        set resp.http.X-Cache = "HIT";
+    } else {
+        set resp.http.X-Cache = "MISS";
+    }
+
+    # Set hits in X-Cache-Hits header
+    set resp.http.X-Cache-Hits = obj.hits;
+
     return(deliver);
 }
diff --git a/start.sh b/start.sh
index 41162e8..391e6f2 100755
--- a/start.sh
+++ b/start.sh
@@ -1,6 +1,13 @@
 #!/bin/bash
 
-aws s3 cp ${S3_VCL_FILE} /etc/varnish/default.vcl
+if [ "$USE_LOCAL_CONFIG" = true ]
+then
+  echo 'Using local config!'
+  cp /mnt/varnish/default.vcl /etc/varnish/default.vcl
+else
+  echo 'Using s3 config!'
+  aws s3 cp ${S3_VCL_FILE} /etc/varnish/default.vcl
+fi
 
 RELOAD_VCL=1
 
@@ -10,4 +17,8 @@ mkdir -p ${VARNISH_CACHE_FOLDER}
 
 varnishd -a 0.0.0.0:${VARNISH_PORT} -T 127.0.0.1:6082 -f /etc/varnish/default.vcl -s file,${VARNISH_CACHE_FOLDER}/varnish_cache.bin,${VARNISH_CACHE_SIZE}
 
-varnishlog
+varnishlog &
+
+# Start varnish cleanup
+
+python3 /usr/app/src/cleanup_handler.py
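The `vcl_deliver` changes above expose `X-Cache` and `X-Cache-Hits` debug headers. A quick way to check caching behaviour is to request the same URL twice and compare those headers; this sketch assumes the container is published on localhost port 80 and that `/some/image` is a placeholder for any cacheable path served by the backend:

```python
import requests

url = "http://localhost/some/image"  # placeholder path behind the Varnish instance

first = requests.get(url)
second = requests.get(url)

# The first response is typically a MISS and the second a HIT with a non-zero hit
# count, provided the object was cacheable (beresp.ttl is set to 1h in vcl_backend_response).
print(first.headers.get("X-Cache"), first.headers.get("X-Cache-Hits"))
print(second.headers.get("X-Cache"), second.headers.get("X-Cache-Hits"))
```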
diff --git a/varnish-cleanup/Dockerfile b/varnish-cleanup/Dockerfile
new file mode 100644
index 0000000..215e6cb
--- /dev/null
+++ b/varnish-cleanup/Dockerfile
@@ -0,0 +1,18 @@
+# Deriving the latest base image
+FROM python:latest
+
+
+# Labels as key value pair
+LABEL Maintainer="digirati"
+LABEL org.opencontainers.image.source=https://github.com/dlcs/dlcs-varnish-cleanup
+LABEL org.opencontainers.image.description="Performs bans on varnish"
+
+RUN pip install pipenv
+WORKDIR /usr/app/src
+COPY requirements.txt ./
+RUN pip install -r requirements.txt
+
+COPY cleanup_handler.py ./
+COPY app ./app
+
+CMD [ "python", "./cleanup_handler.py"]
\ No newline at end of file
diff --git a/varnish-cleanup/Test/Json/FullRequest.json b/varnish-cleanup/Test/Json/FullRequest.json
new file mode 100644
index 0000000..c05fb37
--- /dev/null
+++ b/varnish-cleanup/Test/Json/FullRequest.json
@@ -0,0 +1,60 @@
+{
+  "asset": {
+    "batch": 0,
+    "created": "2023-08-07T09:27:35.487625Z",
+    "customer": 26,
+    "deliveryChannels": [
+      "iiif-img",
+      "iiif-av",
+      "file"
+    ],
+    "duration": 0,
+    "error": "",
+    "family": 73,
+    "finished": "2023-08-07T09:27:37.248125Z",
+    "fullImageOptimisationPolicy": {
+      "customer": 0,
+      "global": false,
+      "id": null,
+      "name": null,
+      "technicalDetails": null
+    },
+    "fullThumbnailPolicy": null,
+    "height": 4149,
+    "id": {
+      "asset": "54378677",
+      "customer": 26,
+      "space": 18
+    },
+    "imageOptimisationPolicy": "fast-higher",
+    "ingesting": false,
+    "initialOrigin": null,
+    "maxUnauthorised": -1,
+    "mediaType": "image/jpg",
+    "notForDelivery": false,
+    "numberReference1": 0,
+    "numberReference2": 0,
+    "numberReference3": 0,
+    "origin": "https://www.nasa.gov/sites/default/files/thumbnails/image/as11-44-6552.jpeg",
+    "preservedUri": "",
+    "reference1": "",
+    "reference2": "",
+    "reference3": "",
+    "requiresAuth": false,
+    "roles": "",
+    "rolesList": [
+    ],
+    "space": 18,
+    "tags": "sb",
+    "tagsList": [
+      "sb"
+    ],
+    "thumbnailPolicy": "default",
+    "width": 4149
+  },
+  "customerPathElement": {
+    "id": 26,
+    "name": "jacklewis"
+  }
+}
+
diff --git a/varnish-cleanup/Test/conftest.py b/varnish-cleanup/Test/conftest.py
new file mode 100644
index 0000000..8c29465
--- /dev/null
+++ b/varnish-cleanup/Test/conftest.py
@@ -0,0 +1,23 @@
+import os
+import boto3
+from moto import mock_sqs
+import cleanup_handler
+import pytest
+@pytest.fixture(scope="function")
+def aws_credentials():
+    """Mocked AWS Credentials for moto."""
+    os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+    os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
+    os.environ["AWS_SECURITY_TOKEN"] = "testing"
+    os.environ["AWS_SESSION_TOKEN"] = "testing"
+    os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+
+@pytest.fixture(scope="function")
+def sqs(aws_credentials):
+    with mock_sqs():
+        yield boto3.resource("sqs", region_name="us-east-1")
+
+@pytest.fixture(scope="function")
+def sqs_client(aws_credentials):
+    with mock_sqs():
+        yield boto3.client("sqs", region_name="us-east-1")
\ No newline at end of file
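The `sqs` fixture above yields a moto-backed boto3 resource, so tests can create throwaway queues without touching AWS. A small illustrative test, assuming it lives next to `conftest.py` in `varnish-cleanup/Test/`:

```python
def test_sqs_fixture_roundtrip(sqs):
    """Sanity check: the mocked resource behaves like a real SQS queue."""
    queue = sqs.create_queue(QueueName="test-delete-notifications")
    queue.send_message(MessageBody="hello")

    messages = queue.receive_messages()
    assert len(messages) == 1
    assert messages[0].body == "hello"
```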
diff --git a/varnish-cleanup/Test/test_cleanup_handler.py b/varnish-cleanup/Test/test_cleanup_handler.py
new file mode 100644
index 0000000..99db25c
--- /dev/null
+++ b/varnish-cleanup/Test/test_cleanup_handler.py
@@ -0,0 +1,75 @@
+import os
+import pytest
+import requests_mock
+
+import boto3
+from time import sleep
+from threading import Thread
+from moto import mock_sqs
+from unittest.mock import patch
+import cleanup_handler
+import mock
+import json
+from app.signal_handler import SignalHandler
+from threading import Thread
+
+@mock_sqs
+def test_write_message_valid(sqs):
+    "Test the start_monitoring_queue method with a valid message"
+    # Arrange
+    name = 'test-delete-notifications'
+    queue = sqs.create_queue(QueueName=name)
+    cleanup_handler.INCOMING_QUEUE = name
+    cleanup_handler.MONITOR_SLEEP_SECS = 1
+    with open('Json/FullRequest.json', 'r') as file:
+        data = file.read().replace('\n', '')
+    queue.send_message(QueueUrl=queue.url, MessageBody=(data))
+    signal_handler = SignalHandler()
+
+    # Act
+    t = Thread(target=cleanup_handler.start_monitoring_queue, args=(sqs, signal_handler,))
+    t.start()
+    # wait for messages to be handled
+    sleep(2)
+
+    # Assert
+    sqs_messages = queue.receive_messages()
+    assert len(sqs_messages) == 0, "queue should have no messages"
+
+    t.join(0.1)
+
+def test_receive_message_bans():
+    "Test the _handle_message method with a valid message"
+    # Arrange
+    with open('Json/FullRequest.json', 'r') as file:
+        data = file.read().replace('\n', '')
+
+    sqs_mock_message = mock.Mock()
+    sqs_mock_message.body = data
+
+    with requests_mock.Mocker() as mo:
+        mo.request("BAN", url="http://localhost/26/18/54378677")
+
+        # Act
+        response = cleanup_handler._handle_message(sqs_mock_message)
+
+        # Assert
+        assert response == True, "response should be a success"
+
+def test_receive_message_bans_fail():
+    "Test the _handle_message method when the BAN request fails"
+    # Arrange
+    with open('Json/FullRequest.json', 'r') as file:
+        data = file.read().replace('\n', '')
+
+    sqs_mock_message = mock.Mock()
+    sqs_mock_message.body = data
+
+    with requests_mock.Mocker() as mo:
+        mo.request("BAN", url="http://localhost/26/18/54378677", status_code=400)
+
+        # Act
+        response = cleanup_handler._handle_message(sqs_mock_message)
+
+        # Assert
+        assert response == False, "response should be a failure"
\ No newline at end of file
diff --git a/varnish-cleanup/app/aws_factory.py b/varnish-cleanup/app/aws_factory.py
new file mode 100644
index 0000000..f8564ed
--- /dev/null
+++ b/varnish-cleanup/app/aws_factory.py
@@ -0,0 +1,19 @@
+import boto3
+from moto import sqs
+
+from app.settings import LOCALSTACK, REGION, LOCALSTACK_ADDRESS
+from logzero import logger
+
+def get_aws_resource(resource_type: str):
+    """Get an aws resource configured to use LocalStack if env var is set"""
+    if LOCALSTACK:
+        logger.warn(f"Using localstack for {resource_type} resource")
+        return boto3.resource(
+            resource_type,
+            region_name=REGION,
+            endpoint_url=LOCALSTACK_ADDRESS,
+            aws_access_key_id="foo",
+            aws_secret_access_key="bar",
+        )
+    else:
+        return boto3.resource(resource_type, REGION)
\ No newline at end of file
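Because `app.settings` reads its environment variables at import time, the LocalStack switch has to be set before the module is imported. A usage sketch for local development, assuming a LocalStack container is already listening on the default address and the queue exists there (`dlcs-delete-notifications` is a placeholder name):

```python
import os

# Must be set before app.settings / app.aws_factory are imported.
os.environ["LOCALSTACK"] = "true"
os.environ["LOCALSTACK_ADDRESS"] = "http://localhost:4566"
os.environ["AWS_REGION"] = "eu-west-1"

from app.aws_factory import get_aws_resource

sqs = get_aws_resource("sqs")  # logs a warning and points boto3 at LocalStack
queue = sqs.get_queue_by_name(QueueName="dlcs-delete-notifications")  # placeholder queue name
print(queue.url)
```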
diff --git a/varnish-cleanup/app/settings.py b/varnish-cleanup/app/settings.py
new file mode 100644
index 0000000..034dba7
--- /dev/null
+++ b/varnish-cleanup/app/settings.py
@@ -0,0 +1,17 @@
+import os
+
+def _get_boolean(env_name: str, fallback: str) -> bool:
+    return os.environ.get(env_name, fallback).lower() in ("true", "t", "1")
+
+MONITOR_SLEEP_SECS = float(os.environ.get("MONITOR_SLEEP_SECS", 30))
+
+# AWS
+REGION = os.environ.get("AWS_REGION", "eu-west-1")
+INCOMING_QUEUE = os.environ.get("INCOMING_QUEUE")
+
+# LocalStack
+LOCALSTACK = _get_boolean("LOCALSTACK", "False")
+LOCALSTACK_ADDRESS = os.environ.get("LOCALSTACK_ADDRESS", "http://localhost:4566")
+
+# varnish
+VARNISH_ADDRESS = os.environ.get("VARNISH_ADDRESS", "http://localhost")
\ No newline at end of file
diff --git a/varnish-cleanup/app/signal_handler.py b/varnish-cleanup/app/signal_handler.py
new file mode 100644
index 0000000..614e00e
--- /dev/null
+++ b/varnish-cleanup/app/signal_handler.py
@@ -0,0 +1,27 @@
+import signal
+
+from logzero import logger
+
+
+class SignalHandler:
+    """Handles sigterm and sigint events"""
+
+    def __init__(self):
+        self._cancellation_requested = False
+        self._setup_signal_handling()
+
+    def cancellation_requested(self):
+        """
+        Verify if lifecycle is to continue
+        :return: True if cancellation requested, else False
+        """
+        return self._cancellation_requested
+
+    def _signal_handler(self, signum, frame):
+        logger.info(f"Caught signal {signum}. Cancellation requested")
+        self._cancellation_requested = True
+
+    def _setup_signal_handling(self):
+        logger.info("setting up signal handling")
+        signal.signal(signal.SIGTERM, self._signal_handler)
+        signal.signal(signal.SIGINT, self._signal_handler)
\ No newline at end of file
diff --git a/varnish-cleanup/cleanup_handler.py b/varnish-cleanup/cleanup_handler.py
new file mode 100644
index 0000000..2c3b617
--- /dev/null
+++ b/varnish-cleanup/cleanup_handler.py
@@ -0,0 +1,82 @@
+import json
+import traceback
+import time
+import requests
+
+from logzero import logger
+from app.aws_factory import get_aws_resource
+from app.settings import INCOMING_QUEUE, MONITOR_SLEEP_SECS, VARNISH_ADDRESS
+from app.signal_handler import SignalHandler
+
+
+def start_monitoring():
+    sqs = get_aws_resource("sqs")
+    signal_handler = SignalHandler()
+    start_monitoring_queue(sqs, signal_handler)
+
+
+def start_monitoring_queue(sqs, signal_handler):
+    incoming_queue = sqs.get_queue_by_name(QueueName=INCOMING_QUEUE)
+
+    logger.info(f"starting monitoring queue '{INCOMING_QUEUE}'")
+
+    try:
+        while not signal_handler.cancellation_requested():
+            message_received = False
+            for message in _get_messages_from_queue(incoming_queue):
+                if message and not signal_handler.cancellation_requested():
+                    message_received = True
+                    try:
+                        if _handle_message(message):
+                            message.delete()
+                    except Exception:
+                        e = traceback.format_exc()
+                        logger.error(f"Error processing message: {e}")
+                else:
+                    message_received = False
+
+            if not message_received and not signal_handler.cancellation_requested():
+                time.sleep(MONITOR_SLEEP_SECS)
+    except Exception as e:
+        logger.error(f"Error getting messages: {e}")
+        raise e
+
+    logger.info(f"stopped monitoring queue '{INCOMING_QUEUE}'...")
+
+
+def _get_messages_from_queue(queue):
+    return queue.receive_messages(WaitTimeSeconds=20, MaxNumberOfMessages=10)
+
+
+def _handle_message(received_message):
+    logger.debug(received_message)
+    message = json.loads(received_message.body)
+    id = _convert_asset_id(message["asset"]["id"])
+    success = True
+
+    varnishUrl = f"{VARNISH_ADDRESS}/{id}"
+
+    response = requests.request(
+        "BAN",
+        url=varnishUrl
+    )
+
+    if response.ok:
+        logger.debug(f"banned {id}")
+    else:
+        success = False
+        logger.error(f"failed to ban {id} - {response.content}")
+
+    return success
+
+
+def _convert_asset_id(id):
+    customer = id["customer"]
+    space = id["space"]
+    asset = id["asset"]
+
+    return str(customer) + "/" + str(space) + "/" + str(asset)
+
+
+if __name__ == "__main__":
+    start_monitoring()
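The handler only reads `asset.id` from the incoming notification, flattens it to `customer/space/asset`, and issues the BAN. A compact sketch of that path using `requests_mock`, mirroring the tests above; the payload is a trimmed-down version of `FullRequest.json` and the snippet is assumed to run from the `varnish-cleanup` directory so the `app` package imports resolve:

```python
import json
from unittest import mock

import requests_mock

import cleanup_handler

payload = {"asset": {"id": {"customer": 26, "space": 18, "asset": "54378677"}}}
message = mock.Mock()
message.body = json.dumps(payload)

assert cleanup_handler._convert_asset_id(payload["asset"]["id"]) == "26/18/54378677"

with requests_mock.Mocker() as m:
    m.request("BAN", url="http://localhost/26/18/54378677")  # default VARNISH_ADDRESS
    assert cleanup_handler._handle_message(message) is True  # the message would then be deleted
```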
diff --git a/varnish-cleanup/requirements.txt b/varnish-cleanup/requirements.txt
new file mode 100644
index 0000000..9da4e97
Binary files /dev/null and b/varnish-cleanup/requirements.txt differ
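The requirements file is shown as binary here, so its contents are not reproduced. To exercise a deployed handler end to end, a delete notification only needs the `asset.id` block; a hedged sketch with boto3, assuming credentials are already configured and `INCOMING_QUEUE` points at the placeholder queue name used below:

```python
import json

import boto3

sqs = boto3.resource("sqs", region_name="eu-west-1")
queue = sqs.get_queue_by_name(QueueName="dlcs-delete-notifications")  # placeholder; use your INCOMING_QUEUE value

notification = {"asset": {"id": {"customer": 26, "space": 18, "asset": "54378677"}}}
queue.send_message(MessageBody=json.dumps(notification))
# The running cleanup handler polls this queue, bans /26/18/54378677 in Varnish,
# and deletes the message on success.
```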