From 9fe584802b7a10747843de956884ee1f04c9e6e8 Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 4 Oct 2023 14:32:39 -0700 Subject: [PATCH 1/2] ingest/curate.smk: Fix metadata_fields param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Found that the metadata_fields param does not work as-is when walking through the template with @kistlerk. Copied fix from @corneliusroemer's nipah repo.¹ ¹ https://github.com/corneliusroemer/nipah/blob/73f2d432b517e32132a8fd597d7d96fa6fbdaf94/ingest/rules/curate.smk#L114 --- ingest/rules/curate.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 26b736a..1a9fa39 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -104,7 +104,7 @@ rule subset_metadata: output: subset_metadata="results/subset_metadata.tsv", params: - metadata_fields=config["curate"]["metadata_columns"], + metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: """ tsv-select -H -f {params.metadata_fields} \ From 5e1b1efd30fc9875980fea4bb956877fec12af5c Mon Sep 17 00:00:00 2001 From: Jover Date: Wed, 4 Oct 2023 14:35:21 -0700 Subject: [PATCH 2/2] ingest/curate.smk: Make the field map config more user friendly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When walking through the template with @kistlerk, the field map config param was a bit of a pain due to the spaces in the field names from NCBI Datasets. I saw that @corneliusroemer made some improvements in his nipah repo¹ to make this more user friendly so incorporating those changes here. 
¹ https://github.com/corneliusroemer/nipah/commit/16a0f94e3da9fc023a6a80bf0b396657c5fd428c Co-authored-by: Cornelius Roemer --- ingest/config/defaults.yaml | 21 +++++++++++++++++++-- ingest/rules/curate.smk | 9 ++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 3bd8fd8..17fb708 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -22,10 +22,27 @@ curate: # The path to the local geolocation rules within the pathogen repo # The path should be relative to the ingest directory. local_geolocation_rules: "config/geolocation_rules.tsv" - # List of field names to change in the format of = + # List of field names to change where the key is the original field name and the value is the new field name # This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names - field_map: [] + # The examples below are based on the NCBI Datasets output TSV column names, your data might have different field names. 
+ field_map: + Source database: database + Isolate Collection date: date + Release date: date_released + Update date: date_updated + Accession: accession + Isolate Lineage: strain + Geographic Region: region + Geographic Location: location + Submitter Names: authors + Submitter Affiliation: institution + SRA Accessions: sra_accessions + Length: length + Host Name: host + Isolate Lineage source: sample_type + BioSample accession: biosample_accession + Submitter Country: submitter_country # List of date fields to standardize to ISO format YYYY-MM-DD date_fields: [] # List of expected date formats that are present in the date fields provided above diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 1a9fa39..5637a81 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -34,6 +34,13 @@ rule concat_geolocation_rules: """ +def format_field_map(field_map: dict[str, str]) -> str: + """ + Format dict to `"key1"="value1" "key2"="value2"...` for use in shell commands. + """ + return " ".join([f'"{key}"="{value}"' for key, value in field_map.items()]) + + # This curate pipeline is based on existing pipelines for pathogen repos using NCBI data. # You may want to add and/or remove steps from the pipeline for custom metadata # curation for your pathogen. Note that the curate pipeline is streaming NDJSON @@ -55,7 +62,7 @@ rule curate: benchmark: "benchmarks/curate.txt" params: - field_map=config["curate"]["field_map"], + field_map=format_field_map(config["curate"]["field_map"]), date_fields=config["curate"]["date_fields"], expected_date_formats=config["curate"]["expected_date_formats"], articles=config["curate"]["titlecase"]["articles"],