Commit
Merge pull request #263 from nextstrain/update-curate
ingest: use new augur curate commands
joverlee521 authored Jul 16, 2024
2 parents d6702c4 + 44545a2 commit 1889dff
Showing 18 changed files with 126 additions and 624 deletions.
67 changes: 0 additions & 67 deletions ingest/bin/ndjson-to-tsv-and-fasta

This file was deleted.

3 changes: 2 additions & 1 deletion ingest/defaults/config.yaml
@@ -26,6 +26,8 @@ curate:
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
   expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']
+  # The expected field that contains the GenBank geo_loc_name
+  genbank_location_field: location
   # Titlecase rules
   titlecase:
     # Abbreviations not cast to titlecase, keeps uppercase
@@ -70,7 +72,6 @@ curate:
       'date_submitted',
       'sra_accession',
       'abbr_authors',
-      'reverse',
       'authors',
       'institution'
     ]
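The `expected_date_formats` entries in the config above are standard Python `datetime` directives. As a minimal sketch (not augur's actual implementation — `parse_date` is a hypothetical helper), such a list can be tried in order until one format parses:

```python
from datetime import datetime

# Formats copied from the config above; tried in order until one parses.
EXPECTED_DATE_FORMATS = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ']

def parse_date(value):
    """Return a datetime for the first matching format, else None."""
    for fmt in EXPECTED_DATE_FORMATS:
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    return None
```

Incomplete dates like `2024` or `2024-07` parse with the shorter formats, which is why the more specific formats must also be listed explicitly.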
43 changes: 28 additions & 15 deletions ingest/rules/curate.smk
@@ -6,7 +6,7 @@ formats and expects input file
 This will produce output files as
-    metadata = "data/metadata_raw.tsv"
+    metadata = "data/subset_metadata.tsv"
     sequences = "results/sequences.fasta"
 Parameters are expected to be defined in `config.curate`.
@@ -49,7 +49,7 @@ rule curate:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["curate"]["annotations"],
     output:
-        metadata="data/metadata_raw.tsv",
+        metadata="data/all_metadata.tsv",
         sequences="results/sequences.fasta",
     log:
         "logs/curate.txt",
@@ -59,46 +59,59 @@ rule curate:
         strain_backup_fields=config["curate"]["strain_backup_fields"],
         date_fields=config["curate"]["date_fields"],
         expected_date_formats=config["curate"]["expected_date_formats"],
+        genbank_location_field=config["curate"]["genbank_location_field"],
         articles=config["curate"]["titlecase"]["articles"],
         abbreviations=config["curate"]["titlecase"]["abbreviations"],
         titlecase_fields=config["curate"]["titlecase"]["fields"],
         authors_field=config["curate"]["authors_field"],
         authors_default_value=config["curate"]["authors_default_value"],
         abbr_authors_field=config["curate"]["abbr_authors_field"],
         annotations_id=config["curate"]["annotations_id"],
-        metadata_columns=config["curate"]["metadata_columns"],
         id_field=config["curate"]["id_field"],
         sequence_field=config["curate"]["sequence_field"],
     shell:
         """
         (cat {input.sequences_ndjson} \
-            | ./vendored/transform-field-names \
+            | augur curate rename \
                 --field-map {params.field_map} \
             | augur curate normalize-strings \
-            | ./vendored/transform-strain-names \
+            | augur curate transform-strain-name \
                 --strain-regex {params.strain_regex} \
                 --backup-fields {params.strain_backup_fields} \
             | augur curate format-dates \
                 --date-fields {params.date_fields} \
                 --expected-date-formats {params.expected_date_formats} \
-            | ./vendored/transform-genbank-location \
+            | augur curate parse-genbank-location \
+                --location-field {params.genbank_location_field} \
             | augur curate titlecase \
                 --titlecase-fields {params.titlecase_fields} \
                 --articles {params.articles} \
                 --abbreviations {params.abbreviations} \
-            | ./vendored/transform-authors \
+            | augur curate abbreviate-authors \
                 --authors-field {params.authors_field} \
                 --default-value {params.authors_default_value} \
                 --abbr-authors-field {params.abbr_authors_field} \
-            | ./vendored/apply-geolocation-rules \
+            | augur curate apply-geolocation-rules \
                 --geolocation-rules {input.all_geolocation_rules} \
-            | ./vendored/merge-user-metadata \
+            | augur curate apply-record-annotations \
                 --annotations {input.annotations} \
                 --id-field {params.annotations_id} \
-            | ./bin/ndjson-to-tsv-and-fasta \
-                --metadata-columns {params.metadata_columns} \
-                --metadata {output.metadata} \
-                --fasta {output.sequences} \
-                --id-field {params.id_field} \
-                --sequence-field {params.sequence_field} ) 2>> {log}
+                --output-metadata {output.metadata} \
+                --output-fasta {output.sequences} \
+                --output-id-field {params.id_field} \
+                --output-seq-field {params.sequence_field} ) 2>> {log}
         """


+rule subset_metadata:
+    input:
+        metadata="data/all_metadata.tsv",
+    output:
+        subset_metadata="data/subset_metadata.tsv",
+    params:
+        metadata_fields=",".join(config["curate"]["metadata_columns"]),
+    shell:
+        """
+        tsv-select -H -f {params.metadata_fields} \
+            {input.metadata} > {output.subset_metadata}
+        """
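The `tsv-select -H -f` call in the new `subset_metadata` rule comes from eBay's tsv-utils; the header-aware column subsetting it performs is roughly equivalent to this Python sketch (`subset_columns` is a hypothetical helper, not part of the pipeline):

```python
import csv

def subset_columns(in_path, out_path, fields):
    """Keep only the named columns of a TSV, like `tsv-select -H -f`."""
    with open(in_path, newline="") as fin, open(out_path, "w", newline="") as fout:
        reader = csv.DictReader(fin, delimiter="\t")
        writer = csv.DictWriter(fout, fieldnames=fields, delimiter="\t",
                                lineterminator="\n")
        writer.writeheader()
        for row in reader:
            # Missing columns become empty cells rather than errors.
            writer.writerow({f: row.get(f, "") for f in fields})
```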
2 changes: 1 addition & 1 deletion ingest/rules/nextclade.smk
@@ -55,7 +55,7 @@ rule nextclade:
 rule join_metadata_clades:
     input:
         nextclade="data/nextclade.tsv",
-        metadata="data/metadata_raw.tsv",
+        metadata="data/subset_metadata.tsv",
         nextclade_field_map=config["nextclade"]["field_map"],
     output:
         metadata="results/metadata.tsv",
3 changes: 0 additions & 3 deletions ingest/vendored/.cramrc

This file was deleted.

17 changes: 17 additions & 0 deletions ingest/vendored/.github/dependabot.yml
@@ -0,0 +1,17 @@
# Dependabot configuration file
# <https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file>
#
# Each ecosystem is checked on a scheduled interval defined below. To trigger
# a check manually, go to
#
# https://github.com/nextstrain/ingest/network/updates
#
# and look for a "Check for updates" button. You may need to click around a
# bit first.
---
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"
10 changes: 1 addition & 9 deletions ingest/vendored/.github/workflows/ci.yaml
@@ -11,13 +11,5 @@ jobs:
   shellcheck:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: nextstrain/.github/actions/shellcheck@master
-
-  cram:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-      - run: pip install cram
-      - run: cram tests/
14 changes: 14 additions & 0 deletions ingest/vendored/.github/workflows/pre-commit.yaml
@@ -0,0 +1,14 @@
name: pre-commit

on:
  - push

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - uses: pre-commit/[email protected]
4 changes: 2 additions & 2 deletions ingest/vendored/.gitrepo
@@ -6,7 +6,7 @@
 [subrepo]
 	remote = https://github.com/nextstrain/ingest
 	branch = main
-	commit = a0faef53a0c6e7cc4057209454ef0852875dc3a9
-	parent = e7411323d0242b69d15b3a93185a6b683a388c28
+	commit = 258ab8ce898a88089bc88caee336f8d683a0e79a
+	parent = a1dd6dedc766e0ca82c893d1c6bd1118dec89889
 	method = merge
 	cmdver = 0.4.6
40 changes: 40 additions & 0 deletions ingest/vendored/.pre-commit-config.yaml
@@ -0,0 +1,40 @@
default_language_version:
  python: python3
repos:
  - repo: https://github.com/pre-commit/sync-pre-commit-deps
    rev: v0.0.1
    hooks:
      - id: sync-pre-commit-deps
  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: v0.10.0.1
    hooks:
      - id: shellcheck
  - repo: https://github.com/rhysd/actionlint
    rev: v1.6.27
    hooks:
      - id: actionlint
        entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: trailing-whitespace
      - id: check-ast
      - id: check-case-conflict
      - id: check-docstring-first
      - id: check-json
      - id: check-executables-have-shebangs
      - id: check-merge-conflict
      - id: check-shebang-scripts-are-executable
      - id: check-symlinks
      - id: check-toml
      - id: check-yaml
      - id: destroyed-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: fix-byte-order-marker
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.4.6
    hooks:
      # Run the linter.
      - id: ruff
34 changes: 21 additions & 13 deletions ingest/vendored/README.md
@@ -2,7 +2,7 @@

 Shared internal tooling for pathogen data ingest. Used by our individual
 pathogen repos which produce Nextstrain builds. Expected to be vendored by
-each pathogen repo using `git subtree`.
+each pathogen repo using `git subrepo`.

 Some tools may only live here temporarily before finding a permanent home in
 `augur curate` or Nextstrain CLI. Others may happily live out their days here.
@@ -117,15 +117,6 @@ Potential Nextstrain CLI scripts
 - [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL.
   Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`.

-Potential augur curate scripts
-
-- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records
-- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records
-- [transform-authors](transform-authors) - Abbreviates full author lists to '<first author> et al.'
-- [transform-field-names](transform-field-names) - Rename fields of NDJSON records
-- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"<country_value>[:<region>][, <locality>]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/)
-- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields.
## Software requirements

Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date.
@@ -134,7 +125,24 @@ Some scripts may require Bash ≥4. If you are running these scripts on macOS, t

 Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack.

-For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. To run these locally,
-
-1. Download Cram: `pip install cram`
-2. Run the tests: `cram tests/`
+## Working on this repo
+
+This repo is configured to use [pre-commit](https://pre-commit.com),
+to help automatically catch common coding errors and syntax issues
+with changes before they are committed to the repo.
+
+If you will be writing new code or otherwise working within this repo,
+please do the following to get started:
+
+1. [install `pre-commit`](https://pre-commit.com/#install) by running
+   either `python -m pip install pre-commit` or `brew install
+   pre-commit`, depending on your preferred package management
+   solution
+2. install the local git hooks by running `pre-commit install` from
+   the root of the repo
+3. when problems are detected, correct them in your local working tree
+   before committing them.
+
+Note that these pre-commit checks are also run in a GitHub Action when
+changes are pushed to GitHub, so correcting issues locally will
+prevent extra cycles of correction.
