Skip to content

Commit

Permalink
Merge pull request #6 from nextstrain/ingest-curate-fixes
Browse files Browse the repository at this point in the history
Ingest curate fixes
  • Loading branch information
joverlee521 authored Oct 10, 2023
2 parents 9a18402 + 5e1b1ef commit 3493a93
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 4 deletions.
21 changes: 19 additions & 2 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,27 @@ curate:
# The path to the local geolocation rules within the pathogen repo
# The path should be relative to the ingest directory.
local_geolocation_rules: "config/geolocation_rules.tsv"
# List of field names to change in the format of <old_field_name>=<new_field_name>
# Mapping of field names to change, where the key is the original field name and the value is the new field name
# This is the first step in the pipeline, so any references to field names
# in the configs below should use the new field names
field_map: []
# The examples below are based on the NCBI Datasets output TSV column names; your data might have different field names.
field_map:
Source database: database
Isolate Collection date: date
Release date: date_released
Update date: date_updated
Accession: accession
Isolate Lineage: strain
Geographic Region: region
Geographic Location: location
Submitter Names: authors
Submitter Affiliation: institution
SRA Accessions: sra_accessions
Length: length
Host Name: host
Isolate Lineage source: sample_type
BioSample accession: biosample_accession
Submitter Country: submitter_country
# List of date fields to standardize to ISO format YYYY-MM-DD
date_fields: []
# List of expected date formats that are present in the date fields provided above
Expand Down
11 changes: 9 additions & 2 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ rule concat_geolocation_rules:
"""


def format_field_map(field_map: dict[str, str]) -> str:
    """
    Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands.

    Characters that the shell still interprets inside double quotes
    (backslash, double quote, dollar sign and backtick) are backslash-escaped
    so that every field name reaches the command verbatim instead of being
    expanded or terminating the quoted string early.

    Parameters
    ----------
    field_map
        Mapping of original field name to new field name.

    Returns
    -------
    str
        Space-separated `"old"="new"` pairs, in the mapping's iteration
        order; an empty string for an empty mapping.
    """
    # Translation table: each shell-special character -> backslash + itself.
    shell_escapes = str.maketrans({char: "\\" + char for char in '\\"$`'})

    def quote(text: object) -> str:
        # str() mirrors f-string formatting, so non-string scalars
        # (e.g. numbers parsed from the config) are still accepted.
        return '"' + str(text).translate(shell_escapes) + '"'

    return " ".join(
        f"{quote(key)}={quote(value)}" for key, value in field_map.items()
    )


# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data.
# You may want to add and/or remove steps from the pipeline for custom metadata
# curation for your pathogen. Note that the curate pipeline is streaming NDJSON
Expand All @@ -55,7 +62,7 @@ rule curate:
benchmark:
"benchmarks/curate.txt"
params:
field_map=config["curate"]["field_map"],
field_map=format_field_map(config["curate"]["field_map"]),
date_fields=config["curate"]["date_fields"],
expected_date_formats=config["curate"]["expected_date_formats"],
articles=config["curate"]["titlecase"]["articles"],
Expand Down Expand Up @@ -104,7 +111,7 @@ rule subset_metadata:
output:
subset_metadata="results/subset_metadata.tsv",
params:
metadata_fields=config["curate"]["metadata_columns"],
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
"""
tsv-select -H -f {params.metadata_fields} \
Expand Down

0 comments on commit 3493a93

Please sign in to comment.