diff --git a/workflows/cron/README.md b/workflows/cron/README.md new file mode 100644 index 000000000..8a5700998 --- /dev/null +++ b/workflows/cron/README.md @@ -0,0 +1,28 @@ +# Contents: + +- [cron-stac-validate-fast](#cron-stac-validate-fast) +- [cron-stac-validate-full](#cron-stac-validate-full) + +# STAC validation + +The goal of the following [Cron Workflows](https://argo-workflows.readthedocs.io/en/stable/cron-workflows/) is to check the validity of the STAC metadata published in the AWS Open Data Registries [NZ Elevation](https://registry.opendata.aws/nz-elevation/) and [NZ Imagery](https://registry.opendata.aws/nz-imagery/). + +> **_NOTE:_** To simplify the overall workflow deployment process, these `CronWorkflow`s have one main task per registry. It looks like a duplication that could be avoided but as we are not using [`argo` CLI](https://argo-workflows.readthedocs.io/en/stable/walk-through/argo-cli/) to deploy the workflows - which allows parameter passing - we could not deploy one `CronWorkflow` per `uri` (or registry). + +## cron-stac-validate-fast + +Workflow that validates the STAC metadata by calling the [`stac-validate` argo-tasks command](https://github.com/linz/argo-tasks/blob/master/README.md#stac-validate) using the [`tpl-at-stac-validate`](https://github.com/linz/topo-workflows/blob/master/templates/argo-tasks/README.md#argo-tasksstac-validate). + +It does verify that the [STAC links](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md#link-object) are valid. + +- schedule: **every day at 5am** + +## cron-stac-validate-full + +Workflow that validates the STAC metadata by calling the [`stac-validate` argo-tasks command](https://github.com/linz/argo-tasks/blob/master/README.md#stac-validate) using the [`stac-validate-parallel`](https://github.com/linz/topo-workflows/blob/master/workflows/stac/README.md#stac-validate-parallel). 
+ +It also validates that the [STAC assets](https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#assets) are valid. Verifying all asset (TIFF files) checksums is expensive, so this workflow is run less often than [cron-stac-validate-fast](#cron-stac-validate-fast). + +> **_NOTE:_** Due to the parallelism design, this workflow does not validate the root parent `catalog.json` in order to validate each `collection.json` separately. This is not an issue as the `catalog.json` does not contain any `asset` and is already validated by the [cron-stac-validate-fast](#cron-stac-validate-fast) job. + +- schedule: **every 1st of the month** diff --git a/workflows/cron/cron-stac-validate-fast.yaml b/workflows/cron/cron-stac-validate-fast.yaml new file mode 100644 index 000000000..dc1bdba52 --- /dev/null +++ b/workflows/cron/cron-stac-validate-fast.yaml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/argoproj/argo-workflows/v3.5.5/api/jsonschema/schema.json +apiVersion: argoproj.io/v1alpha1 +kind: CronWorkflow +metadata: + name: cron-stac-validate-fast + labels: + linz.govt.nz/category: stac +spec: + schedule: '0 05 * * *' # 5 AM every day + timezone: 'NZ' + startingDeadlineSeconds: 3600 # Allow 1 hour delay if the workflow-controller clashes during the starting time. 
+ concurrencyPolicy: 'Allow' + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + suspend: false + workflowSpec: + entrypoint: main + arguments: + parameters: + - name: checksum_assets + value: 'false' + - name: 'checksum_links' + value: 'true' + templates: + - name: main + retryStrategy: + limit: '0' + steps: + - - name: stac-validate-imagery + templateRef: + name: tpl-at-stac-validate + template: main + arguments: + parameters: + - name: 'uri' + value: 's3://nz-imagery/catalog.json' + - name: 'checksum_assets' + value: '{{workflow.parameters.checksum_assets}}' + - name: 'checksum_links' + value: '{{workflow.parameters.checksum_links}}' + - name: stac-validate-elevation + templateRef: + name: tpl-at-stac-validate + template: main + arguments: + parameters: + - name: 'uri' + value: 's3://nz-elevation/catalog.json' + - name: 'checksum_assets' + value: '{{workflow.parameters.checksum_assets}}' + - name: 'checksum_links' + value: '{{workflow.parameters.checksum_links}}' diff --git a/workflows/cron/cron-stac-validate-full.yaml b/workflows/cron/cron-stac-validate-full.yaml new file mode 100644 index 000000000..d3496ff82 --- /dev/null +++ b/workflows/cron/cron-stac-validate-full.yaml @@ -0,0 +1,64 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/argoproj/argo-workflows/v3.5.5/api/jsonschema/schema.json +apiVersion: argoproj.io/v1alpha1 +kind: CronWorkflow +metadata: + name: cron-stac-validate-full + labels: + linz.govt.nz/category: stac +spec: + schedule: '0 05 1 * *' # 5 AM every 1st of the month + timezone: 'NZ' + startingDeadlineSeconds: 3600 # Allow 1 hour delay if the workflow-controller clashes during the starting time. 
+ concurrencyPolicy: 'Allow' + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + suspend: false + workflowSpec: + entrypoint: main + arguments: + parameters: + - name: version_argo_tasks + value: 'v4' + - name: 'include' + value: 'collection.json$' + - name: checksum_assets + value: 'true' + - name: 'checksum_links' + value: 'true' + templates: + - name: main + retryStrategy: + limit: '0' + steps: + - - name: stac-validate-imagery + templateRef: + name: stac-validate-parallel + template: main + arguments: + parameters: + - name: version_argo_tasks + value: '{{workflow.parameters.version_argo_tasks}}' + - name: 'uri' + value: 's3://nz-imagery/' + - name: include + value: '{{workflow.parameters.include}}' + - name: 'checksum_assets' + value: '{{workflow.parameters.checksum_assets}}' + - name: 'checksum_links' + value: '{{workflow.parameters.checksum_links}}' + - name: stac-validate-elevation + templateRef: + name: stac-validate-parallel + template: main + arguments: + parameters: + - name: version_argo_tasks + value: '{{workflow.parameters.version_argo_tasks}}' + - name: 'uri' + value: 's3://nz-elevation/' + - name: include + value: '{{workflow.parameters.include}}' + - name: 'checksum_assets' + value: '{{workflow.parameters.checksum_assets}}' + - name: 'checksum_links' + value: '{{workflow.parameters.checksum_links}}' diff --git a/workflows/stac/stac-validate-parallel.yaml b/workflows/stac/stac-validate-parallel.yaml index d0d75b57b..ab949ba7d 100644 --- a/workflows/stac/stac-validate-parallel.yaml +++ b/workflows/stac/stac-validate-parallel.yaml @@ -38,12 +38,32 @@ spec: image: '' templates: - name: main + inputs: + parameters: + - name: version_argo_tasks + value: '{{workflow.parameters.version_argo_tasks}}' + - name: include + value: '{{workflow.parameters.include}}' + - name: uri + value: '{{workflow.parameters.uri}}' + - name: checksum_assets + value: '{{workflow.parameters.checksum_assets}}' + - name: checksum_links + value: 
'{{workflow.parameters.checksum_links}}' retryStrategy: limit: '0' # avoid retrying any of the following task as `tpl-at-stac-validate` already retries its own tasks. dag: tasks: - name: aws-list-collections template: aws-list-collections + arguments: + parameters: + - name: version_argo_tasks + value: '{{inputs.parameters.version_argo_tasks}}' + - name: include + value: '{{inputs.parameters.include}}' + - name: uri + value: '{{inputs.parameters.uri}}' - name: stac-validate-collections templateRef: name: tpl-at-stac-validate @@ -53,16 +73,21 @@ spec: - name: uri value: '{{item}}' - name: checksum_assets - value: '{{workflow.parameters.checksum_assets}}' + value: '{{inputs.parameters.checksum_assets}}' - name: checksum_links - value: '{{workflow.parameters.checksum_links}}' + value: '{{inputs.parameters.checksum_links}}' depends: aws-list-collections withParam: '{{tasks.aws-list-collections.outputs.parameters.files}}' - name: aws-list-collections + inputs: + parameters: + - name: version_argo_tasks + - name: include + - name: uri retryStrategy: limit: '2' # force retrying this specific task container: - image: '019359803926.dkr.ecr.ap-southeast-2.amazonaws.com/argo-tasks:{{=sprig.trim(workflow.parameters.version_argo_tasks)}}' + image: '019359803926.dkr.ecr.ap-southeast-2.amazonaws.com/argo-tasks:{{=sprig.trim(inputs.parameters.version_argo_tasks)}}' command: [node, /app/index.js] env: - name: AWS_ROLE_CONFIG_PATH @@ -72,12 +97,12 @@ spec: 'list', '--verbose', '--include', - '{{=sprig.trim(workflow.parameters.include)}}', + '{{=sprig.trim(inputs.parameters.include)}}', '--group', '1', '--output', '/tmp/file_list.json', - '{{=sprig.trim(workflow.parameters.uri)}}', + '{{=sprig.trim(inputs.parameters.uri)}}', ] outputs: parameters: