# Patch summary: deletes ingest/bin/join-metadata-and-clades.py (pandas-based join), adds ingest/source-data/nextclade-field-map.tsv (Nextclade column -> metadata column names), and rewrites the shell body of Snakemake rule join_metadata_clades to rename the Nextclade columns with `csvtk rename2` and append them to the metadata with `tsv-join`.
# Review note: the fill value given to `tsv-join --write-all` is quoted ('?') so bash passes it literally; an unquoted ? is a glob pattern and would be replaced by any one-character filename present in the working directory.
diff --git a/ingest/bin/join-metadata-and-clades.py b/ingest/bin/join-metadata-and-clades.py deleted file mode 100755 index 3a0e919e..00000000 --- a/ingest/bin/join-metadata-and-clades.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import re -import sys -import pandas as pd - -NEXTCLADE_JOIN_COLUMN_NAME = 'seqName' -VALUE_MISSING_DATA = '?' - -column_map = { - "clade": "clade", - "outbreak": "outbreak", - "lineage": "lineage", - "coverage": "coverage", - "totalMissing": "missing_data", - "totalSubstitutions": "divergence", - "totalNonACGTNs": "nonACGTN", - "qc.missingData.status": "QC_missing_data", - "qc.mixedSites.status": "QC_mixed_sites", - "qc.privateMutations.status": "QC_rare_mutations", - "qc.frameShifts.status": "QC_frame_shifts", - "qc.stopCodons.status": "QC_stop_codons", - "frameShifts": "frame_shifts", - "isReverseComplement": "is_reverse_complement", -# "deletions": "deletions", -# "insertions": "insertions" -# "substitutions": "substitutions", -# "aaSubstitutions": "aaSubstitutions" -} - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Joins metadata file with Nextclade clade output", - ) - parser.add_argument("--metadata") - parser.add_argument("--nextclade") - parser.add_argument("--id-field") - parser.add_argument("-o", default=sys.stdout) - return parser.parse_args() - -def main(): - args = parse_args() - - metadata = pd.read_csv(args.metadata, index_col=args.id_field, - sep='\t', low_memory=False, na_filter = False) - - # Read and rename clade column to be more descriptive - clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME, - sep='\t', low_memory=False, na_filter = False) \ - .rename(columns=column_map) - - clades.index = clades.index.map(lambda x: re.sub(" \|.*", "", x)) - - # Select columns in column map - clades = clades[list(column_map.values())] - - # Separate long from short columns - short_metadata = metadata.iloc[:,:-2].copy() - long_metadata = 
metadata.iloc[:,-2:].copy() - - # Concatenate on columns - result = pd.merge( - short_metadata, clades, - left_index=True, - right_index=True, - how='left' - ) - - # Add long columns to back - result = pd.concat([result, long_metadata], axis=1) - - result.to_csv(args.o, index_label=args.id_field, sep='\t') - - -if __name__ == '__main__': - main() diff --git a/ingest/source-data/nextclade-field-map.tsv b/ingest/source-data/nextclade-field-map.tsv new file mode 100644 index 00000000..41894fe9 --- /dev/null +++ b/ingest/source-data/nextclade-field-map.tsv @@ -0,0 +1,17 @@ +key value +index index +seqName seqName +clade clade +outbreak outbreak +lineage lineage +coverage coverage +totalMissing missing_data +totalSubstitutions divergence +totalNonACGTNs nonACGTN +qc.missingData.status QC_missing_data +qc.mixedSites.status QC_mixed_sites +qc.privateMutations.status QC_rare_mutations +qc.frameShifts.status QC_frame_shifts +qc.stopCodons.status QC_stop_codons +frameShifts frame_shifts +isReverseComplement is_reverse_complement \ No newline at end of file diff --git a/ingest/workflow/snakemake_rules/nextclade.smk b/ingest/workflow/snakemake_rules/nextclade.smk index 28da0a98..ceed8d68 100644 --- a/ingest/workflow/snakemake_rules/nextclade.smk +++ b/ingest/workflow/snakemake_rules/nextclade.smk @@ -56,15 +56,31 @@ rule join_metadata_clades: input: nextclade="data/nextclade.tsv", metadata="data/metadata_raw.tsv", + nextclade_field_map="source-data/nextclade-field-map.tsv", output: - "data/metadata.tsv", + metadata="data/metadata.tsv", params: id_field=config["transform"]["id_field"], shell: """ - python3 bin/join-metadata-and-clades.py \ - --id-field {params.id_field} \ - --metadata {input.metadata} \ - --nextclade {input.nextclade} \ - -o {output} + csvtk -tl rename2 \ + -F \ + -f '*' \ + -p '(.+)' \ + -r '{{kv}}' \ + -k {input.nextclade_field_map} \ + {input.nextclade} \ + > results/nextclade_renamed.tsv + + export APPEND_FIELDS=`awk 'NR>1 {{print $2}}' 
{input.nextclade_field_map} | grep -v -e "index" -e "seqName" | tr '\n' ',' | sed 's/,\$//g'` + + tsv-join -H \ + --filter-file results/nextclade_renamed.tsv \ + --key-fields seqName \ + --data-fields {params.id_field} \ + --append-fields $APPEND_FIELDS \ + --allow-duplicate-keys \ + --write-all '?' \ + {input.metadata} \ + > {output.metadata} """