From a146a3e2f2f028e429c2c8fad2638b11976e8c21 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Sat, 28 Sep 2024 10:00:55 -0400 Subject: [PATCH] Drop serotype annotation from filenames --- ingest/Snakefile | 8 ++------ .../nextstrain-automation/config.yaml | 4 ++-- ingest/rules/curate.smk | 14 +++++++------- ingest/rules/fetch_from_ncbi.smk | 4 ++-- ingest/rules/nextclade.smk | 6 +++--- phylogenetic/build-configs/ci/config.yaml | 4 ++-- phylogenetic/defaults/config.yaml | 4 ++-- .../{metadata_all.tsv => metadata.tsv} | 0 .../{sequences_all.fasta => sequences.fasta} | 0 9 files changed, 20 insertions(+), 24 deletions(-) rename phylogenetic/example_data/{metadata_all.tsv => metadata.tsv} (100%) rename phylogenetic/example_data/{sequences_all.fasta => sequences.fasta} (100%) diff --git a/ingest/Snakefile b/ingest/Snakefile index 97d49b7..dd74e5b 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -4,16 +4,12 @@ workdir: workflow.current_basedir # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "defaults/config.yaml" -serotypes = ["all"] -wildcard_constraints: - serotype = "|".join(serotypes) - # This is the default rule that Snakemake will run when there are no specified targets. # The default output of the ingest workflow is usually the curated metadata and sequences. rule all: input: - sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes), - metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes), + sequences="results/sequences.fasta", + metadata="results/metadata.tsv", # Include smk files that contain the core steps necessary for building the curated metadata and sequence files. 
# If there are build-specific customizations, they should be added with the diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml index 4533217..3520dac 100644 --- a/ingest/build-configs/nextstrain-automation/config.yaml +++ b/ingest/build-configs/nextstrain-automation/config.yaml @@ -16,5 +16,5 @@ s3_dst: "s3://nextstrain-data/files/workflows/wnv" # Mapping of files to upload files_to_upload: - metadata.tsv.zst: results/metadata_all.tsv - sequences.fasta.zst: results/sequences_all.fasta + metadata.tsv.zst: results/metadata.tsv + sequences.fasta.zst: results/sequences.fasta diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 3efb714..77dc91a 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -43,16 +43,16 @@ def format_field_map(field_map: dict[str, str]) -> str: rule curate: input: - sequences_ndjson="data/genbank_all.ndjson", + sequences_ndjson="data/genbank.ndjson", all_geolocation_rules="data/all-geolocation-rules.tsv", annotations=config["curate"]["annotations"], output: - metadata="data/raw_metadata_{serotype}_curated.tsv", - sequences="results/sequences_{serotype}.fasta", + metadata="data/raw_metadata_curated.tsv", + sequences="results/sequences.fasta", log: - "logs/curate_{serotype}.txt", + "logs/curate.txt", benchmark: - "benchmarks/curate_{serotype}.txt", + "benchmarks/curate.txt", params: field_map=format_field_map(config["curate"]["field_map"]), strain_regex=config["curate"]["strain_regex"], @@ -109,9 +109,9 @@ rule curate: rule subset_metadata: input: - metadata="data/raw_metadata_{serotype}_curated.tsv", + metadata="data/raw_metadata_curated.tsv", output: - metadata="data/raw_metadata_{serotype}.tsv", + metadata="data/raw_metadata.tsv", params: metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 6e188e5..29005d1 100644 --- 
a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -9,7 +9,7 @@ file must exist as a static file in the repo. Produces final output as - sequences_ndjson = "data/sequences_{serotype}.ndjson" + sequences_ndjson = "data/genbank.ndjson" """ workflow.global_resources.setdefault("concurrent_deploys", 2) @@ -88,7 +88,7 @@ rule format_ncbi_datasets_ndjson: ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta", ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv", output: - ndjson = "data/genbank_all.ndjson", + ndjson = "data/genbank.ndjson", log: "logs/format_ncbi_datasets_ndjson.txt" benchmark: diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 53d5348..acb6098 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -16,7 +16,7 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html rule nextclade_classify: #Classifies sequences into clades using Nextclade input: - sequences="results/sequences_all.fasta", + sequences="results/sequences.fasta", dataset=config["nextclade"]["nextclade_dataset_path"], output: nextclade_tsv="data/nextclade_results/nextclade.tsv", @@ -52,10 +52,10 @@ rule select_nextclade_columns: rule append_nextclade_columns: #Append the nextclade results to the metadata input: - metadata="data/raw_metadata_all.tsv", + metadata="data/raw_metadata.tsv", nextclade_subtypes="data/nextclade_clades.tsv", output: - metadata_all="results/metadata_all.tsv", + metadata_all="results/metadata.tsv", params: id_field=config["curate"]["output_id_field"], nextclade_field=config["nextclade"]["nextclade_field"], diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml index 8db22cf..2533419 100644 --- a/phylogenetic/build-configs/ci/config.yaml +++ b/phylogenetic/build-configs/ci/config.yaml @@ -2,8 +2,8 @@ # for the CI workflow to run with the example data.
# Pull in metadata and sequences from the example_data directory -input_metadata: "example_data/metadata_all.tsv" -input_sequences: "example_data/sequences_all.fasta" +input_metadata: "example_data/metadata.tsv" +input_sequences: "example_data/sequences.fasta" ## Custom rules to run as part of the CI automated workflow ## The paths should be relative to the phylogenetic directory. diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 7538e93..e0aa780 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -6,8 +6,8 @@ reference: "defaults/reference.gb" root: "AF481864" # Pull in metadata and sequences from the ingest workflow -input_metadata: "../ingest/results/metadata_all.tsv" -input_sequences: "../ingest/results/sequences_all.fasta" +input_metadata: "../ingest/results/metadata.tsv" +input_sequences: "../ingest/results/sequences.fasta" #subsampling: #all: --min-length '9800' --query "country == 'USA' & accession != 'NC_009942'" diff --git a/phylogenetic/example_data/metadata_all.tsv b/phylogenetic/example_data/metadata.tsv similarity index 100% rename from phylogenetic/example_data/metadata_all.tsv rename to phylogenetic/example_data/metadata.tsv diff --git a/phylogenetic/example_data/sequences_all.fasta b/phylogenetic/example_data/sequences.fasta similarity index 100% rename from phylogenetic/example_data/sequences_all.fasta rename to phylogenetic/example_data/sequences.fasta