Drop serotype annotation from filenames

nextstrain · Sep 28, 2024 · a146a3e
1 parent b27d088
commit a146a3e
Show file tree

Hide file tree

Showing 9 changed files with 20 additions and 24 deletions.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -4,16 +4,12 @@ workdir: workflow.current_basedir
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "defaults/config.yaml"
 
-serotypes = ["all"]
-wildcard_constraints:
-    serotype = "|".join(serotypes)
-
 # This is the default rule that Snakemake will run when there are no specified targets.
 # The default output of the ingest workflow is usually the curated metadata and sequences.
 rule all:
     input:
-        sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
-        metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),
+        sequences="results/sequences.fasta",
+        metadata="results/metadata.tsv",
 
 # Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
 # If there are build-specific customizations, they should be added with the

diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -16,5 +16,5 @@ s3_dst: "s3://nextstrain-data/files/workflows/wnv"
 
 # Mapping of files to upload
 files_to_upload:
-  metadata.tsv.zst: results/metadata_all.tsv
-  sequences.fasta.zst: results/sequences_all.fasta
+  metadata.tsv.zst: results/metadata.tsv
+  sequences.fasta.zst: results/sequences.fasta
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -43,16 +43,16 @@ def format_field_map(field_map: dict[str, str]) -> str:
 
 rule curate:
     input:
-        sequences_ndjson="data/genbank_all.ndjson",
+        sequences_ndjson="data/genbank.ndjson",
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["curate"]["annotations"],
     output:
-        metadata="data/raw_metadata_{serotype}_curated.tsv",
-        sequences="results/sequences_{serotype}.fasta",
+        metadata="data/raw_metadata_curated.tsv",
+        sequences="results/sequences.fasta",
     log:
-        "logs/curate_{serotype}.txt",
+        "logs/curate.txt",
     benchmark:
-        "benchmarks/curate_{serotype}.txt",
+        "benchmarks/curate.txt",
     params:
         field_map=format_field_map(config["curate"]["field_map"]),
         strain_regex=config["curate"]["strain_regex"],
@@ -109,9 +109,9 @@ rule curate:
 
 rule subset_metadata:
     input:
-        metadata="data/raw_metadata_{serotype}_curated.tsv",
+        metadata="data/raw_metadata_curated.tsv",
     output:
-        metadata="data/raw_metadata_{serotype}.tsv",
+        metadata="data/raw_metadata.tsv",
     params:
         metadata_fields=",".join(config["curate"]["metadata_columns"]),
     shell:

diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
@@ -9,7 +9,7 @@ file must exist as a static file in the repo.
 
 Produces final output as
 
-    sequences_ndjson = "data/sequences_{serotype}.ndjson"
+    sequences_ndjson = "data/sequences.ndjson"
 
 """
 workflow.global_resources.setdefault("concurrent_deploys", 2)
@@ -88,7 +88,7 @@ rule format_ncbi_datasets_ndjson:
         ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta",
         ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv",
     output:
-        ndjson = "data/genbank_all.ndjson",
+        ndjson = "data/genbank.ndjson",
     log:
         "logs/format_ncbi_datasets_ndjson.txt"
     benchmark:

diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -16,7 +16,7 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
 rule nextclade_classify:
     #Classifies sequences into clades using Nextclade
     input:
-        sequences="results/sequences_all.fasta",
+        sequences="results/sequences.fasta",
         dataset=config["nextclade"]["nextclade_dataset_path"],
     output:
         nextclade_tsv="data/nextclade_results/nextclade.tsv",
@@ -52,10 +52,10 @@ rule select_nextclade_columns:
 rule append_nextclade_columns:
     #Append the nextclade results to the metadata
     input:
-        metadata="data/raw_metadata_all.tsv",
+        metadata="data/raw_metadata.tsv",
         nextclade_subtypes="data/nextclade_clades.tsv",
     output:
-        metadata_all="results/metadata_all.tsv",
+        metadata_all="results/metadata.tsv",
     params:
         id_field=config["curate"]["output_id_field"],
         nextclade_field=config["nextclade"]["nextclade_field"],

diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml
@@ -2,8 +2,8 @@
 # for the CI workflow to run with the example data.
 
 # Pull in metadata and sequences from the example_data directory
-input_metadata: "example_data/metadata_all.tsv"
-input_sequences: "example_data/sequences_all.fasta"
+input_metadata: "example_data/metadata.tsv"
+input_sequences: "example_data/sequences.fasta"
 
 ## Custom rules to run as part of the CI automated workflow
 ## The paths should be relative to the phylogenetic directory.

diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
@@ -6,8 +6,8 @@ reference: "defaults/reference.gb"
 root: "AF481864"
 
 # Pull in metadata and sequences from the ingest workflow
-input_metadata: "../ingest/results/metadata_all.tsv"
-input_sequences: "../ingest/results/sequences_all.fasta"
+input_metadata: "../ingest/results/metadata.tsv"
+input_sequences: "../ingest/results/sequences.fasta"
 
 #subsampling:
   #all: --min-length '9800' --query "country == 'USA' & accession != 'NC_009942'"

diff --git a/phylogenetic/example_data/metadata_all.tsv b/phylogenetic/example_data/metadata.tsv
rename from phylogenetic/example_data/metadata_all.tsv
rename to phylogenetic/example_data/metadata.tsv
diff --git a/phylogenetic/example_data/sequences_all.fasta b/phylogenetic/example_data/sequences.fasta
rename from phylogenetic/example_data/sequences_all.fasta
rename to phylogenetic/example_data/sequences.fasta