Patch summary (free text ahead of the first "diff --git" header).

  * ingest/README.md: adds a "Run" section showing `nextstrain build .` and
    listing sequences.fasta and metadata.tsv as outputs in `results`.
  * ingest/Snakefile: adds a `rule all` whose inputs are
    results/sequences.fasta and results/metadata.tsv, and, when the config
    has no "nextclade" key, a `create_final_metadata` rule that moves
    results/subset_metadata.tsv to results/metadata.tsv.

NOTE(review): this patch was recovered from a copy collapsed onto one line.
Record boundaries follow the hunk line counts (6/18 and 9/31); the wrap
points of the README context paragraph and the 4-space Snakefile
indentation are reconstructed. Confirm against the target files.

diff --git a/ingest/README.md b/ingest/README.md
index b7116b7..ce5068d 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -7,6 +7,18 @@ If you have another data source or private data that needs to be formatted for
 the phylogenetic workflow, then you can use a similar workflow to curate your
 own data.
 
+## Run
+
+From within the `ingest` directory, run the workflow with:
+
+```
+nextstrain build .
+```
+
+This produces a `results` directory with the following outputs:
+- sequences.fasta
+- metadata.tsv
+
 ## Config
 
 The config directory contains all of the default configurations for the ingest workflow.
diff --git a/ingest/Snakefile b/ingest/Snakefile
index fb0a17a..121fe54 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -1,9 +1,31 @@
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
+rule all:
+    input:
+        "results/sequences.fasta",
+        "results/metadata.tsv",
+
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
 
+
+# If included, the nextclade rules will create the final metadata TSV by
+# joining the Nextclade output with the metadata.
+# However, if not including nextclade, we have to rename the subset metadata TSV
+# to the final metadata TSV.
 if "nextclade" in config:
 
     include: "rules/nextclade.smk"
+
+else:
+
+    rule create_final_metadata:
+        input:
+            metadata="results/subset_metadata.tsv"
+        output:
+            metadata="results/metadata.tsv"
+        shell:
+            """
+            mv {input.metadata} {output.metadata}
+            """