Skip to content

Commit

Permalink
refactor: Renamed ref-genome in rules (#106)
Browse files Browse the repository at this point in the history
* Soft-coded the ref-genome DB

* Including different databases for classification

* Changes in Qiime env

* fmt

* fmt

* fmt

* fmt

* fmt

* fmt

* fmt test

* fmt

* fmt

* Separating DB download

* fmt

* fmt

* Ref-DB calling changed

---------

Co-authored-by: Ann-Kathrin Brüggemann <[email protected]>
Co-authored-by: AKBrueggemann <[email protected]>
  • Loading branch information
3 people authored Feb 16, 2024
1 parent 1a15959 commit 0ffdb2a
Show file tree
Hide file tree
Showing 14 changed files with 115 additions and 62 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,14 @@ jobs:
run: |
cd .tests/resources
ls -la
wget https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
wget https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza
wget --no-check-certificate https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
gzip -d GRCh38_latest_genomic.fna.gz
mkdir minikraken2_v2_8GB_201904_UPDATE/
tar -xf small_db.tgz --directory minikraken2_v2_8GB_201904_UPDATE/
wget -O ref-seqs.qza https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
wget -O ref-taxa.qza https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza
wget --no-check-certificate -O ref-genome.fna.gz https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
gzip -d ref-genome.fna.gz
mkdir filtering-database/
tar -xf small_db.tgz --directory filtering-database/
rm small_db.tgz
cd minikraken2_v2_8GB_201904_UPDATE
cd filtering-database
mv small_db/* .
rm -r small_db
cd ../..
Expand Down
6 changes: 6 additions & 0 deletions .tests/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,16 @@ metadata: config/pep/metadata.txt
remove-columns: ["site_name"]
# Paths to the databases used for classification and taxonomy
database:
Silva: True
download-path-seq: resources/silva-138-99-seqs.qza
download-path-tax: resources/silva-138-99-tax.qza
kraken-db: ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_v2_8GB_201904.tgz
ref-genome: https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
Greengenes: False
gg2-seq: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.full-length.fna.qza
gg2-tax: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.tax.qza
NCBI: False
NCBI-query: 33175[BioProject]
# Forward and reverse adapters used for sequencing
adapter1: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
adapter2: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
Expand Down
6 changes: 6 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,16 @@ metadata: config/pep/metadata.txt
remove-columns: ["site_name"]
# Paths to the databases used for classification and taxonomy
database:
Silva: True
download-path-seq: https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
download-path-tax: https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza
kraken-db: ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_v2_8GB_201904.tgz
ref-genome: https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
Greengenes: False
gg2-seq: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.full-length.fna.qza
gg2-tax: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.tax.qza
NCBI: False
NCBI-query: 33175[BioProject]
# Forward and reverse adapters used for sequencing
adapter1: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
adapter2: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
Expand Down
4 changes: 2 additions & 2 deletions workflow/envs/qiime-only-env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,6 @@ dependencies:
- zstandard=0.19.0
- zstd=1.5.2
- pip:
#- git+https://github.com/bokulich-lab/RESCRIPt.git
- git+https://github.com/bokulich-lab/RESCRIPt.git
- git+https://github.com/yxia0125/q2-repeat-rarefy.git
#- git+https://github.com/bokulich-lab/q2-types-genomics.git
- git+https://github.com/bokulich-lab/q2-types-genomics.git
3 changes: 1 addition & 2 deletions workflow/rules/bowtie.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@ if config["bowtie"] == True:

rule create_bowtie_db:
input:
"resources/GRCh38_latest_genomic_upper.fna",
"resources/ref-genome_upper.fna",
output:
#files="resources/bowtie_host_DB",
dirc=directory("resources/bowtie_DB/"),
params:
filename="bowtie_host_DB",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/chimera_checking.smk
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ rule chimera_only:
rule chimera_taxonomy:
input:
query="results/{date}/out/rep-seqs-chimeras.qza",
reference_reads="resources/silva-138-99-seqs.qza",
reference_taxonomy="resources/silva-138-99-tax.qza",
reference_reads="resources/ref-seqs.qza",
reference_taxonomy="resources/ref-taxa.qza",
output:
"results/{date}/out/chimera_taxonomy.qza",
params:
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/classification.smk
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ if config["DADA2"] == False:
rule classification:
input:
query="results/{date}/out/seq-cluster-filtered.qza",
reference_reads="resources/silva-138-99-seqs.qza",
reference_taxonomy="resources/silva-138-99-tax.qza",
reference_reads="resources/ref-seqs.qza",
reference_taxonomy="resources/ref-taxa.qza",
output:
tax="results/{date}/out/taxonomy.qza",
search="results/{date}/out/blast-search-results.qza",
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/filtering.smk
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ if config["DADA2"] == False:
input:
seq="results/{date}/out/derepl-seq.qza",
table="results/{date}/out/derepl-table.qza",
ref_seq="resources/GRCh38_latest_genomic_upper.qza",
ref_seq="resources/ref-genome_upper.qza",
output:
seq="results/{date}/out/derep-seq-nonhum.qza",
table="results/{date}/out/derep-table-nonhum.qza",
Expand Down
21 changes: 21 additions & 0 deletions workflow/rules/get_Greengenes_DB.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Define the Greengenes2 download rule only when Greengenes is the single
# selected classification database. The three flags must be combined with
# `and`: wrapping them in parentheses separated by commas builds a tuple,
# and a non-empty tuple is always truthy, which would define this rule
# regardless of the configuration.
if (
    config["database"]["Greengenes"] == True
    and config["database"]["Silva"] == False
    and config["database"]["NCBI"] == False
):

    rule get_greengenes:
        output:
            seq="resources/ref-seqs.qza",
            tax="resources/ref-taxa.qza",
        params:
            seq=str(config["database"]["gg2-seq"]),
            tax=str(config["database"]["gg2-tax"]),
        log:
            "logs/prep_Greengenes.log",
        conda:
            "../envs/python.yaml"
        # Write straight to the declared outputs (no `cd`), so the relative
        # log path stays valid; the URLs are shell-quoted with `:q` and wget's
        # stderr is captured in the declared log file.
        shell:
            "wget -O {output.seq} {params.seq:q} 2> {log}; "
            "wget -O {output.tax} {params.tax:q} 2>> {log}; "
22 changes: 22 additions & 0 deletions workflow/rules/get_NCBI_DB.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Define the NCBI download rule only when NCBI is the single selected
# classification database. The three flags must be combined with `and`:
# wrapping them in parentheses separated by commas builds a tuple, and a
# non-empty tuple is always truthy, which would define this rule regardless
# of the configuration.
if (
    config["database"]["NCBI"] == True
    and config["database"]["Silva"] == False
    and config["database"]["Greengenes"] == False
):

    rule get_NCBI_ref:
        output:
            seq="resources/ref-seqs.qza",
            tax="resources/ref-taxa.qza",
        params:
            query=config["database"]["NCBI-query"],
        log:
            "logs/prep_NCBI.log",
        conda:
            "../envs/qiime-only-env.yaml"
        # The query is shell-quoted with `:q` because Entrez queries such as
        # `33175[BioProject]` contain glob characters and may contain spaces.
        shell:
            "qiime rescript get-ncbi-data "
            "--p-query {params.query:q} "
            "--o-sequences {output.seq} "
            "--o-taxonomy {output.tax} "
            "--verbose 2> {log} "
21 changes: 21 additions & 0 deletions workflow/rules/get_SILVA_DB.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Define the SILVA download rule only when SILVA is the single selected
# classification database. The three flags must be combined with `and`:
# wrapping them in parentheses separated by commas builds a tuple, and a
# non-empty tuple is always truthy, which would define this rule regardless
# of the configuration.
if (
    config["database"]["Silva"] == True
    and config["database"]["Greengenes"] == False
    and config["database"]["NCBI"] == False
):

    rule get_SILVA:
        output:
            seq="resources/ref-seqs.qza",
            tax="resources/ref-taxa.qza",
        params:
            seq=str(config["database"]["download-path-seq"]),
            tax=str(config["database"]["download-path-tax"]),
        log:
            "logs/prep_SILVA.log",
        conda:
            "../envs/python.yaml"
        # Write straight to the declared outputs (no `cd`), so the relative
        # log path stays valid; the URLs are shell-quoted with `:q` and wget's
        # stderr is captured in the declared log file.
        shell:
            "wget -O {output.seq} {params.seq:q} 2> {log}; "
            "wget -O {output.tax} {params.tax:q} 2>> {log}; "
62 changes: 20 additions & 42 deletions workflow/rules/preprocessing.smk
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
rule get_database:
output:
seq="resources/silva-138-99-seqs.qza",
tax="resources/silva-138-99-tax.qza",
genomic="resources/GRCh38_latest_genomic.fna.gz",
kraken="resources/minikraken2_v2_8GB_201904.tgz",
genomic=temp("resources/ref-genome.fna.gz"),
kraken=temp("resources/filtering-database.tgz"),
params:
seq=str(config["database"]["download-path-seq"]),
tax=str(config["database"]["download-path-tax"]),
genomic=str(config["database"]["ref-genome"]),
kraken=str(config["database"]["kraken-db"]),
log:
Expand All @@ -15,31 +11,8 @@ rule get_database:
"../envs/python.yaml"
shell:
"cd resources; "
"wget {params.genomic}; "
"wget {params.kraken}; "
"wget {params.seq}; "
"wget {params.tax}; "


# rule get_SILVA:
# output:
# seq_rna=temp("resources/silva-138.1-ssu-nr99-rna-seqs.qza"),
# tax=temp("resources/silva-138-99-tax.qza"),
# params:
# version="138.1",
# target="SSURef_NR99",
# log:
# "logs/prerp_SILVA.log",
# conda:
# "../envs/qiime-only-env.yaml"
# shell:
# "qiime rescript get-silva-data "
# "--p-version {params.version} "
# "--p-target {params.target} "
# "--p-include-species-labels "
# "--o-silva-sequences {output.seq_rna} "
# "--o-silva-taxonomy {output.tax} "
# "2> {log}"
"wget -O ref-genome.fna.gz {params.genomic}; "
"wget -O filtering-database.tgz {params.kraken}; "


# rule rna_to_dna_SILVA:
Expand All @@ -56,13 +29,11 @@ rule get_database:
# "--i-rna-sequences {input} "
# "--o-dna-sequences {output} "
# "2> {log}"


rule unzip_ref_gen:
input:
"resources/GRCh38_latest_genomic.fna.gz",
"resources/ref-genome.fna.gz",
output:
temp("resources/GRCh38_latest_genomic.fna"),
temp("resources/ref-genome.fna"),
log:
"logs/unzip_ref_gen.log",
conda:
Expand All @@ -73,9 +44,9 @@ rule unzip_ref_gen:

rule lower_to_upper:
input:
"resources/GRCh38_latest_genomic.fna",
"resources/ref-genome.fna",
output:
temp("resources/GRCh38_latest_genomic_upper.fna"),
temp("resources/ref-genome_upper.fna"),
log:
"logs/lower_to_upper_fasta.log",
conda:
Expand All @@ -86,9 +57,9 @@ rule lower_to_upper:

rule import_ref_genome:
input:
"resources/GRCh38_latest_genomic_upper.fna",
"resources/ref-genome_upper.fna",
output:
temp("resources/GRCh38_latest_genomic_upper.qza"),
temp("resources/ref-genome_upper.qza"),
log:
"logs/import_ref_gen.log",
conda:
Expand All @@ -103,15 +74,22 @@ rule import_ref_genome:

rule unzip_kraken:
input:
"resources/minikraken2_v2_8GB_201904.tgz",
"resources/filtering-database.tgz",
output:
temp(directory("resources/minikraken2_v2_8GB_201904_UPDATE")),
temp(directory("resources/filtering-database")),
log:
"logs/unzip_kraken_db.log",
conda:
"../envs/python.yaml"
shell:
"tar -zxvf {input} -C resources;"
"cd resources; "
"mkdir filtering-database; "
"cd ..;"
"tar -zxvf {input} --directory resources/filtering-database;"
"cd resources/filtering-database;"
"cp */* .;"
"cd ..;"
"rm -rf filtering-database/*/;"


if config["bowtie"] == False:
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/qualitycontroll.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ if config["datatype"] == "SampleData[PairedEndSequencesWithQuality]":

rule kraken_analysis:
input:
db="resources/minikraken2_v2_8GB_201904_UPDATE/",
db="resources/filtering-database/",
read1="data/{date}/{sample}_L001_R1_001.fastq.gz",
read2="data/{date}/{sample}_L001_R2_001.fastq.gz",
output:
Expand All @@ -24,7 +24,7 @@ if config["datatype"] == "SampleData[SequencesWithQuality]":

rule kraken_analysis:
input:
db="resources/minikraken2_v2_8GB_201904_UPDATE/",
db="resources/filtering-database/",
read="data/{date}/{sample}_L001_R1_001.fastq.gz",
output:
report="results/{date}/out/kraken/{sample}.kreport2",
Expand Down
4 changes: 2 additions & 2 deletions workflow/rules/reduced_analysis.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rule demux_stats:
"--verbose 2> {log}"


if config["data-type"] == "human":
if config["data-type"] == "human" and config["bowtie"] == False:

rule visual_humancount:
input:
Expand Down Expand Up @@ -99,7 +99,7 @@ if config["data-type"] == "human":
"../scripts/extract_humancount.py"


if config["data-type"] == "environmental":
if config["data-type"] == "environmental" or config["bowtie"] == True:

rule unzip_human_dummy:
output:
Expand Down

0 comments on commit 0ffdb2a

Please sign in to comment.