diff --git a/ingest/Snakefile b/ingest/Snakefile index c96655b..1be1d6f 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -1,5 +1,6 @@ # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "config/defaults.yaml" + include: "rules/fetch_from_ncbi.smk" include: "rules/curate.smk" diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 4855c8f..26b736a 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -6,31 +6,34 @@ from NCBI and outputs the clean data as two separate files: - results/sequences.fasta """ + # The following two rules can be ignored if you choose not to use the # generalized geolocation rules that are shared across pathogens. # The Nextstrain team will try to maintain a generalized set of geolocation # rules that can then be overridden by local geolocation rules per pathogen repo. rule fetch_general_geolocation_rules: output: - general_geolocation_rules = "data/general-geolocation-rules.tsv" + general_geolocation_rules="data/general-geolocation-rules.tsv", params: - geolocation_rules_url = config["curate"]["geolocation_rules_url"] + geolocation_rules_url=config["curate"]["geolocation_rules_url"], shell: """ curl {params.geolocation_rules_url} > {output.general_geolocation_rules} """ + rule concat_geolocation_rules: input: - general_geolocation_rules = "data/general-geolocation-rules.tsv", - local_geolocation_rules = config["curate"]["local_geolocation_rules"] + general_geolocation_rules="data/general-geolocation-rules.tsv", + local_geolocation_rules=config["curate"]["local_geolocation_rules"], output: - all_geolocation_rules = "data/all-geolocation-rules.tsv" + all_geolocation_rules="data/all-geolocation-rules.tsv", shell: """ cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules} """ + # This curate pipeline is based on existing pipelines for pathogen repos using NCBI data. # You may want to add and/or remove steps from the pipeline for custom metadata # curation for your pathogen. Note that the curate pipeline is streaming NDJSON @@ -40,15 +43,15 @@ rule concat_geolocation_rules: # separate files: a metadata TSV and a sequences FASTA. rule curate: input: - sequences_ndjson = "data/ncbi.ndjson", + sequences_ndjson="data/ncbi.ndjson", # Change the geolocation_rules input path if you are removing the above two rules - all_geolocation_rules = "data/all-geolocation-rules.tsv", - annotations = config["curate"]["annotations"] + all_geolocation_rules="data/all-geolocation-rules.tsv", + annotations=config["curate"]["annotations"], output: - metadata = "results/all_metadata.tsv", - sequences = "results/sequences.fasta" + metadata="results/all_metadata.tsv", + sequences="results/sequences.fasta", log: - "logs/curate.txt" + "logs/curate.txt", benchmark: "benchmarks/curate.txt" params: @@ -94,13 +97,14 @@ rule curate: --output-seq-field {params.sequence_field} ) 2>> {log} """ + rule subset_metadata: input: - metadata="results/all_metadata.tsv" + metadata="results/all_metadata.tsv", output: - subset_metadata="results/subset_metadata.tsv" + subset_metadata="results/subset_metadata.tsv", params: - metadata_fields=config["curate"]["metadata_columns"] + metadata_fields=config["curate"]["metadata_columns"], shell: """ tsv-select -H -f {params.metadata_fields} \ diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk index 9b0c847..3eec150 100644 --- a/ingest/rules/fetch_from_ncbi.smk +++ b/ingest/rules/fetch_from_ncbi.smk @@ -22,11 +22,12 @@ to the other approaches. ########################## 1. Fetch from Entrez ########################### ########################################################################### + rule fetch_from_ncbi_entrez: params: - term = config["entrez_search_term"] + term=config["entrez_search_term"], output: - genbank = "data/genbank.gb" + genbank="data/genbank.gb", # Allow retries in case of network errors retries: 5 benchmark: @@ -41,9 +42,9 @@ rule fetch_from_ncbi_entrez: rule parse_genbank_to_ndjson: input: - genbank = "data/genbank.gb" + genbank="data/genbank.gb", output: - ndjson = "data/ncbi.ndjson" + ndjson="data/ncbi.ndjson", benchmark: "benchmarks/parse_genbank_to_ndjson.txt" shell: @@ -56,11 +57,12 @@ rule parse_genbank_to_ndjson: ####################### 2. Fetch from NCBI Datasets ####################### ########################################################################### + rule fetch_ncbi_dataset_package: params: - ncbi_taxon_id = config["ncbi_taxon_id"], + ncbi_taxon_id=config["ncbi_taxon_id"], output: - dataset_package = temp("data/ncbi_dataset.zip") + dataset_package=temp("data/ncbi_dataset.zip"), # Allow retries in case of network errors retries: 5 benchmark: @@ -75,9 +77,9 @@ rule fetch_ncbi_dataset_package: rule extract_ncbi_dataset_sequences: input: - dataset_package = "data/ncbi_dataset.zip" + dataset_package="data/ncbi_dataset.zip", output: - ncbi_dataset_sequences = temp("data/ncbi_dataset_sequences.fasta") + ncbi_dataset_sequences=temp("data/ncbi_dataset_sequences.fasta"), benchmark: "benchmarks/extract_ncbi_dataset_sequences.txt" shell: @@ -122,11 +124,13 @@ def _get_ncbi_dataset_field_mnemonics(provided_fields: list) -> str: rule format_ncbi_dataset_report: input: - dataset_package = "data/ncbi_dataset.zip" + dataset_package="data/ncbi_dataset.zip", output: - ncbi_dataset_tsv = temp("data/ncbi_dataset_report.tsv") + ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include = _get_ncbi_dataset_field_mnemonics(config["ncbi_dataset_fields"]) + fields_to_include=_get_ncbi_dataset_field_mnemonics( + config["ncbi_dataset_fields"] + ), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: @@ -144,12 +148,12 @@ rule format_ncbi_dataset_report: # data that we host on data.nextstrain.org rule format_ncbi_datasets_ndjson: input: - ncbi_dataset_sequences = "data/ncbi_dataset_sequences.fasta", - ncbi_dataset_tsv = "data/ncbi_dataset_report.tsv", + ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta", + ncbi_dataset_tsv="data/ncbi_dataset_report.tsv", output: - ndjson = "data/ncbi.ndjson", + ndjson="data/ncbi.ndjson", log: - "logs/format_ncbi_datasets_ndjson.txt" + "logs/format_ncbi_datasets_ndjson.txt", benchmark: "benchmarks/format_ncbi_datasets_ndjson.txt" shell: diff --git a/nextclade/Snakefile b/nextclade/Snakefile index 4f8fe84..f87d384 100644 --- a/nextclade/Snakefile +++ b/nextclade/Snakefile @@ -1,10 +1,12 @@ # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "config/defaults.yaml" + rule all: input: # Fill in path to the final exported Auspice JSON - auspice_json = "" + auspice_json="", + include: "rules/preprocess.smk" include: "rules/prepare_sequences.smk" diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 93057b5..5145372 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -1,10 +1,12 @@ # Use default configuration values. Override with Snakemake's --configfile/--config options. configfile: "config/defaults.yaml" + rule all: input: # Fill in path to the final exported Auspice JSON - auspice_json = "" + auspice_json="", + include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk"