Skip to content

Commit

Permalink
Drop serotype annotation from filenames
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Sep 28, 2024
1 parent b27d088 commit a146a3e
Show file tree
Hide file tree
Showing 9 changed files with 20 additions and 24 deletions.
8 changes: 2 additions & 6 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,12 @@ workdir: workflow.current_basedir
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "defaults/config.yaml"

serotypes = ["all"]
wildcard_constraints:
serotype = "|".join(serotypes)

# This is the default rule that Snakemake will run when there are no specified targets.
# The default output of the ingest workflow is usually the curated metadata and sequences.
rule all:
input:
sequences=expand("results/sequences_{serotype}.fasta", serotype=serotypes),
metadata=expand("results/metadata_{serotype}.tsv", serotype=serotypes),
sequences="results/sequences.fasta",
metadata="results/metadata.tsv",

# Include smk files that contain the core steps necessary for building the curated metadata and sequence files.
# If there are build-specific customizations, they should be added with the
Expand Down
4 changes: 2 additions & 2 deletions ingest/build-configs/nextstrain-automation/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ s3_dst: "s3://nextstrain-data/files/workflows/wnv"

# Mapping of files to upload
files_to_upload:
metadata.tsv.zst: results/metadata_all.tsv
sequences.fasta.zst: results/sequences_all.fasta
metadata.tsv.zst: results/metadata.tsv
sequences.fasta.zst: results/sequences.fasta
14 changes: 7 additions & 7 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,16 @@ def format_field_map(field_map: dict[str, str]) -> str:

rule curate:
input:
sequences_ndjson="data/genbank_all.ndjson",
sequences_ndjson="data/genbank.ndjson",
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["curate"]["annotations"],
output:
metadata="data/raw_metadata_{serotype}_curated.tsv",
sequences="results/sequences_{serotype}.fasta",
metadata="data/raw_metadata_curated.tsv",
sequences="results/sequences.fasta",
log:
"logs/curate_{serotype}.txt",
"logs/curate.txt",
benchmark:
"benchmarks/curate_{serotype}.txt",
"benchmarks/curate.txt",
params:
field_map=format_field_map(config["curate"]["field_map"]),
strain_regex=config["curate"]["strain_regex"],
Expand Down Expand Up @@ -109,9 +109,9 @@ rule curate:

rule subset_metadata:
input:
metadata="data/raw_metadata_{serotype}_curated.tsv",
metadata="data/raw_metadata_curated.tsv",
output:
metadata="data/raw_metadata_{serotype}.tsv",
metadata="data/raw_metadata.tsv",
params:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
Expand Down
4 changes: 2 additions & 2 deletions ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ file must exist as a static file in the repo.
Produces final output as
sequences_ndjson = "data/sequences_{serotype}.ndjson"
ndjson = "data/genbank.ndjson"
"""
workflow.global_resources.setdefault("concurrent_deploys", 2)
Expand Down Expand Up @@ -88,7 +88,7 @@ rule format_ncbi_datasets_ndjson:
ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta",
ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv",
output:
ndjson = "data/genbank_all.ndjson",
ndjson = "data/genbank.ndjson",
log:
"logs/format_ncbi_datasets_ndjson.txt"
benchmark:
Expand Down
6 changes: 3 additions & 3 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
rule nextclade_classify:
#Classifies sequences into clades using Nextclade
input:
sequences="results/sequences_all.fasta",
sequences="results/sequences.fasta",
dataset=config["nextclade"]["nextclade_dataset_path"],
output:
nextclade_tsv="data/nextclade_results/nextclade.tsv",
Expand Down Expand Up @@ -52,10 +52,10 @@ rule select_nextclade_columns:
rule append_nextclade_columns:
#Append the nextclade results to the metadata
input:
metadata="data/raw_metadata_all.tsv",
metadata="data/raw_metadata.tsv",
nextclade_subtypes="data/nextclade_clades.tsv",
output:
metadata_all="results/metadata_all.tsv",
metadata_all="results/metadata.tsv",
params:
id_field=config["curate"]["output_id_field"],
nextclade_field=config["nextclade"]["nextclade_field"],
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/build-configs/ci/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# for the CI workflow to run with the example data.

# Pull in metadata and sequences from the example_data directory
input_metadata: "example_data/metadata_all.tsv"
input_sequences: "example_data/sequences_all.fasta"
input_metadata: "example_data/metadata.tsv"
input_sequences: "example_data/sequences.fasta"

## Custom rules to run as part of the CI automated workflow
## The paths should be relative to the phylogenetic directory.
Expand Down
4 changes: 2 additions & 2 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ reference: "defaults/reference.gb"
root: "AF481864"

# Pull in metadata and sequences from the ingest workflow
input_metadata: "../ingest/results/metadata_all.tsv"
input_sequences: "../ingest/results/sequences_all.fasta"
input_metadata: "../ingest/results/metadata.tsv"
input_sequences: "../ingest/results/sequences.fasta"

#subsampling:
#all: --min-length '9800' --query "country == 'USA' & accession != 'NC_009942'"
Expand Down
File renamed without changes.
File renamed without changes.

0 comments on commit a146a3e

Please sign in to comment.