
Merge pull request #3 from nextstrain/snakeformat
Run snakefmt on snakefiles
joverlee521 authored Sep 18, 2023
2 parents c336044 + 1eaf21a commit 9a18402
Showing 5 changed files with 44 additions and 31 deletions.
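
For context: snakefmt is the black-based autoformatter for Snakemake files, and the hunks below show its signature changes, namely spaces around = removed inside rule keywords, trailing commas added, and top-level rules separated by two blank lines. A minimal sketch of how the formatter is typically run (flags assumed from its black-style CLI; check snakefmt --help for your version):

    # Reformat every Snakefile and *.smk file under the current directory in place
    snakefmt .

    # Verify formatting without rewriting files, e.g. in CI
    snakefmt --check .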
1 change: 1 addition & 0 deletions ingest/Snakefile
@@ -1,5 +1,6 @@
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
32 changes: 18 additions & 14 deletions ingest/rules/curate.smk
@@ -6,31 +6,34 @@ from NCBI and outputs the clean data as two separate files:
   - results/sequences.fasta
 """
 
+
 # The following two rules can be ignored if you choose not to use the
 # generalized geolocation rules that are shared across pathogens.
 # The Nextstrain team will try to maintain a generalized set of geolocation
 # rules that can then be overridden by local geolocation rules per pathogen repo.
 rule fetch_general_geolocation_rules:
     output:
-        general_geolocation_rules = "data/general-geolocation-rules.tsv"
+        general_geolocation_rules="data/general-geolocation-rules.tsv",
     params:
-        geolocation_rules_url = config["curate"]["geolocation_rules_url"]
+        geolocation_rules_url=config["curate"]["geolocation_rules_url"],
     shell:
         """
         curl {params.geolocation_rules_url} > {output.general_geolocation_rules}
         """
 
+
 rule concat_geolocation_rules:
     input:
-        general_geolocation_rules = "data/general-geolocation-rules.tsv",
-        local_geolocation_rules = config["curate"]["local_geolocation_rules"]
+        general_geolocation_rules="data/general-geolocation-rules.tsv",
+        local_geolocation_rules=config["curate"]["local_geolocation_rules"],
     output:
-        all_geolocation_rules = "data/all-geolocation-rules.tsv"
+        all_geolocation_rules="data/all-geolocation-rules.tsv",
     shell:
         """
         cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules}
         """
 
+
 # This curate pipeline is based on existing pipelines for pathogen repos using NCBI data.
 # You may want to add and/or remove steps from the pipeline for custom metadata
 # curation for your pathogen. Note that the curate pipeline is streaming NDJSON
@@ -40,15 +43,15 @@ rule concat_geolocation_rules:
 # separate files: a metadata TSV and a sequences FASTA.
 rule curate:
     input:
-        sequences_ndjson = "data/ncbi.ndjson",
+        sequences_ndjson="data/ncbi.ndjson",
         # Change the geolocation_rules input path if you are removing the above two rules
-        all_geolocation_rules = "data/all-geolocation-rules.tsv",
-        annotations = config["curate"]["annotations"]
+        all_geolocation_rules="data/all-geolocation-rules.tsv",
+        annotations=config["curate"]["annotations"],
     output:
-        metadata = "results/all_metadata.tsv",
-        sequences = "results/sequences.fasta"
+        metadata="results/all_metadata.tsv",
+        sequences="results/sequences.fasta",
     log:
-        "logs/curate.txt"
+        "logs/curate.txt",
     benchmark:
         "benchmarks/curate.txt"
     params:
@@ -94,13 +97,14 @@ rule curate:
             --output-seq-field {params.sequence_field} ) 2>> {log}
         """
 
+
 rule subset_metadata:
     input:
-        metadata="results/all_metadata.tsv"
+        metadata="results/all_metadata.tsv",
     output:
-        subset_metadata="results/subset_metadata.tsv"
+        subset_metadata="results/subset_metadata.tsv",
     params:
-        metadata_fields=config["curate"]["metadata_columns"]
+        metadata_fields=config["curate"]["metadata_columns"],
     shell:
         """
         tsv-select -H -f {params.metadata_fields} \
34 changes: 19 additions & 15 deletions ingest/rules/fetch_from_ncbi.smk
@@ -22,11 +22,12 @@ to the other approaches.
 ########################## 1. Fetch from Entrez ###########################
 ###########################################################################
 
+
 rule fetch_from_ncbi_entrez:
     params:
-        term = config["entrez_search_term"]
+        term=config["entrez_search_term"],
     output:
-        genbank = "data/genbank.gb"
+        genbank="data/genbank.gb",
     # Allow retries in case of network errors
     retries: 5
     benchmark:
@@ -41,9 +42,9 @@ rule fetch_from_ncbi_entrez:
 
 rule parse_genbank_to_ndjson:
     input:
-        genbank = "data/genbank.gb"
+        genbank="data/genbank.gb",
     output:
-        ndjson = "data/ncbi.ndjson"
+        ndjson="data/ncbi.ndjson",
     benchmark:
         "benchmarks/parse_genbank_to_ndjson.txt"
     shell:
@@ -56,11 +57,12 @@ rule parse_genbank_to_ndjson:
 ####################### 2. Fetch from NCBI Datasets #######################
 ###########################################################################
 
+
 rule fetch_ncbi_dataset_package:
     params:
-        ncbi_taxon_id = config["ncbi_taxon_id"],
+        ncbi_taxon_id=config["ncbi_taxon_id"],
     output:
-        dataset_package = temp("data/ncbi_dataset.zip")
+        dataset_package=temp("data/ncbi_dataset.zip"),
     # Allow retries in case of network errors
     retries: 5
     benchmark:
@@ -75,9 +77,9 @@ rule fetch_ncbi_dataset_package:
 
 rule extract_ncbi_dataset_sequences:
     input:
-        dataset_package = "data/ncbi_dataset.zip"
+        dataset_package="data/ncbi_dataset.zip",
     output:
-        ncbi_dataset_sequences = temp("data/ncbi_dataset_sequences.fasta")
+        ncbi_dataset_sequences=temp("data/ncbi_dataset_sequences.fasta"),
     benchmark:
         "benchmarks/extract_ncbi_dataset_sequences.txt"
     shell:
@@ -122,11 +124,13 @@ def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str:
 
 rule format_ncbi_dataset_report:
     input:
-        dataset_package = "data/ncbi_dataset.zip"
+        dataset_package="data/ncbi_dataset.zip",
     output:
-        ncbi_dataset_tsv = temp("data/ncbi_dataset_report.tsv")
+        ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include = _get_ncbi_dataset_field_mnemonics(config["ncbi_dataset_fields"])
+        fields_to_include=_get_ncbi_dataset_field_mnemonics(
+            config["ncbi_dataset_fields"]
+        ),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
@@ -144,12 +148,12 @@ rule format_ncbi_dataset_report:
 # data that we host on data.nextstrain.org
 rule format_ncbi_datasets_ndjson:
     input:
-        ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta",
-        ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv",
+        ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta",
+        ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
     output:
-        ndjson = "data/ncbi.ndjson",
+        ndjson="data/ncbi.ndjson",
     log:
-        "logs/format_ncbi_datasets_ndjson.txt"
+        "logs/format_ncbi_datasets_ndjson.txt",
     benchmark:
         "benchmarks/format_ncbi_datasets_ndjson.txt"
     shell:
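
The last two hunk headers reference the helper _get_ncbi_dataset_field_mnemonics, whose body falls outside this diff. As a hypothetical sketch only, assuming the helper joins field mnemonics into the comma-separated list that NCBI's dataformat tool expects:

    # Hypothetical sketch; the real helper in fetch_from_ncbi.smk may differ.
    def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str:
        """Join NCBI Datasets report field mnemonics into one comma-separated
        string, keeping the accession first so downstream rules can rely on it."""
        fields = ["accession"]  # assumption: accession is always the first field
        fields += [field for field in provided_fields if field != "accession"]
        return ",".join(fields)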
4 changes: 3 additions & 1 deletion nextclade/Snakefile
@@ -1,10 +1,12 @@
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+
 rule all:
     input:
         # Fill in path to the final exported Auspice JSON
-        auspice_json = ""
+        auspice_json="",
 
+
 include: "rules/preprocess.smk"
 include: "rules/prepare_sequences.smk"
4 changes: 3 additions & 1 deletion phylogenetic/Snakefile
@@ -1,10 +1,12 @@
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+
 rule all:
     input:
         # Fill in path to the final exported Auspice JSON
-        auspice_json = ""
+        auspice_json="",
 
+
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
