refactor: Renamed ref-genome in rules (#106)

* Soft-coded the ref-genome DB * Including different databases for classification * Changes in Qiime env * fmt * fmt * fmt * fmt * fmt * fmt * fmt test * fmt * fmt * Separating DB download * fmt * fmt * Ref-DB calling changed --------- Co-authored-by: Ann-Kathrin Brüggemann <[email protected]> Co-authored-by: AKBrueggemann <[email protected]>
IKIM-Essen · Feb 16, 2024 · 0ffdb2a · 0ffdb2a
1 parent 1a15959
commit 0ffdb2a
Show file tree

Hide file tree

Showing 14 changed files with 115 additions and 62 deletions.
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -91,14 +91,14 @@ jobs:
       run: |
         cd .tests/resources
         ls -la
-        wget https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
-        wget https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza 
-        wget --no-check-certificate https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
-        gzip -d GRCh38_latest_genomic.fna.gz
-        mkdir minikraken2_v2_8GB_201904_UPDATE/
-        tar -xf small_db.tgz --directory minikraken2_v2_8GB_201904_UPDATE/
+        wget -O ref-seqs.qza https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
+        wget -O ref-taxa.qza https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza 
+        wget --no-check-certificate -O ref-genome.fna.gz https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
+        gzip -d ref-genome.fna.gz
+        mkdir filtering-database/
+        tar -xf small_db.tgz --directory filtering-database/
         rm small_db.tgz
-        cd minikraken2_v2_8GB_201904_UPDATE
+        cd filtering-database
         mv small_db/* .
         rm -r small_db
         cd ../..

diff --git a/.tests/config/config.yaml b/.tests/config/config.yaml
@@ -27,10 +27,16 @@ metadata: config/pep/metadata.txt
 remove-columns: ["site_name"]
 # Paths to the databases used for classification and taxonomy
 database:
+  # Reference database used for classification. Set exactly one of Silva,
+  # Greengenes and NCBI to True and the other two to False; the matching
+  # download rule then provides resources/ref-seqs.qza and resources/ref-taxa.qza.
+  Silva: True
   download-path-seq: resources/silva-138-99-seqs.qza
   download-path-tax: resources/silva-138-99-tax.qza
   kraken-db: ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_v2_8GB_201904.tgz
   ref-genome: https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
+  Greengenes: False
+  # Greengenes2 sequence and taxonomy artifacts, fetched when Greengenes is True
+  gg2-seq: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.full-length.fna.qza
+  gg2-tax: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.tax.qza
+  NCBI: False
+  # Query handed to "qiime rescript get-ncbi-data --p-query" when NCBI is True;
+  # quoted so the brackets are always read as part of a plain string.
+  NCBI-query: '33175[BioProject]'
 # Forward and reverse adapters used for sequencing
 adapter1: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
 adapter2: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG

diff --git a/config/config.yaml b/config/config.yaml
@@ -27,10 +27,16 @@ metadata: config/pep/metadata.txt
 remove-columns: ["site_name"]
 # Paths to the databases used for classification and taxonomy
 database:
+  # Reference database used for classification. Set exactly one of Silva,
+  # Greengenes and NCBI to True and the other two to False; the matching
+  # download rule then provides resources/ref-seqs.qza and resources/ref-taxa.qza.
+  Silva: True
   download-path-seq: https://data.qiime2.org/2022.2/common/silva-138-99-seqs.qza
   download-path-tax: https://data.qiime2.org/2022.2/common/silva-138-99-tax.qza
   kraken-db: ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/old/minikraken2_v2_8GB_201904.tgz
   ref-genome: https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.fna.gz
+  Greengenes: False
+  # Greengenes2 sequence and taxonomy artifacts, fetched when Greengenes is True
+  gg2-seq: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.full-length.fna.qza
+  gg2-tax: http://ftp.microbio.me/greengenes_release/current/2022.10.backbone.tax.qza
+  NCBI: False
+  # Query handed to "qiime rescript get-ncbi-data --p-query" when NCBI is True;
+  # quoted so the brackets are always read as part of a plain string.
+  NCBI-query: '33175[BioProject]'
 # Forward and reverse adapters used for sequencing
 adapter1: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
 adapter2: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG

diff --git a/workflow/envs/qiime-only-env.yaml b/workflow/envs/qiime-only-env.yaml
@@ -678,6 +678,6 @@ dependencies:
   - zstandard=0.19.0
   - zstd=1.5.2
   - pip:
-    #- git+https://github.com/bokulich-lab/RESCRIPt.git
+    - git+https://github.com/bokulich-lab/RESCRIPt.git
     - git+https://github.com/yxia0125/q2-repeat-rarefy.git
-    #- git+https://github.com/bokulich-lab/q2-types-genomics.git
+    - git+https://github.com/bokulich-lab/q2-types-genomics.git
diff --git a/workflow/rules/bowtie.smk b/workflow/rules/bowtie.smk
@@ -2,9 +2,8 @@ if config["bowtie"] == True:
 
     rule create_bowtie_db:
         input:
-            "resources/GRCh38_latest_genomic_upper.fna",
+            "resources/ref-genome_upper.fna",
         output:
-            #files="resources/bowtie_host_DB",
             dirc=directory("resources/bowtie_DB/"),
         params:
             filename="bowtie_host_DB",

diff --git a/workflow/rules/chimera_checking.smk b/workflow/rules/chimera_checking.smk
@@ -26,8 +26,8 @@ rule chimera_only:
 rule chimera_taxonomy:
     input:
         query="results/{date}/out/rep-seqs-chimeras.qza",
-        reference_reads="resources/silva-138-99-seqs.qza",
-        reference_taxonomy="resources/silva-138-99-tax.qza",
+        reference_reads="resources/ref-seqs.qza",
+        reference_taxonomy="resources/ref-taxa.qza",
     output:
         "results/{date}/out/chimera_taxonomy.qza",
     params:

diff --git a/workflow/rules/classification.smk b/workflow/rules/classification.smk
@@ -45,8 +45,8 @@ if config["DADA2"] == False:
 rule classification:
     input:
         query="results/{date}/out/seq-cluster-filtered.qza",
-        reference_reads="resources/silva-138-99-seqs.qza",
-        reference_taxonomy="resources/silva-138-99-tax.qza",
+        reference_reads="resources/ref-seqs.qza",
+        reference_taxonomy="resources/ref-taxa.qza",
     output:
         tax="results/{date}/out/taxonomy.qza",
         search="results/{date}/out/blast-search-results.qza",

diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk
@@ -321,7 +321,7 @@ if config["DADA2"] == False:
         input:
             seq="results/{date}/out/derepl-seq.qza",
             table="results/{date}/out/derepl-table.qza",
-            ref_seq="resources/GRCh38_latest_genomic_upper.qza",
+            ref_seq="resources/ref-genome_upper.qza",
         output:
             seq="results/{date}/out/derep-seq-nonhum.qza",
             table="results/{date}/out/derep-table-nonhum.qza",

diff --git a/workflow/rules/get_Greengenes_DB.smk b/workflow/rules/get_Greengenes_DB.smk
@@ -0,0 +1,21 @@
+# Fetch the Greengenes2 reference sequences and taxonomy as the classification
+# reference (resources/ref-seqs.qza and resources/ref-taxa.qza).
+# The rule is only defined when Greengenes is the single database enabled in the
+# config. The three checks must be joined with `and`: a parenthesised,
+# comma-separated list is a tuple, and a non-empty tuple is always truthy.
+if (
+    config["database"]["Greengenes"] == True
+    and config["database"]["Silva"] == False
+    and config["database"]["NCBI"] == False
+):
+
+    rule get_greengenes:
+        output:
+            seq="resources/ref-seqs.qza",
+            tax="resources/ref-taxa.qza",
+        params:
+            seq=str(config["database"]["gg2-seq"]),
+            tax=str(config["database"]["gg2-tax"]),
+        log:
+            "logs/prep_Greengenes.log",
+        conda:
+            "../envs/python.yaml"
+        shell:
+            # Write straight to the declared outputs (no `cd`), shell-quote the
+            # URLs, and send wget's messages to the declared log file.
+            "wget -O {output.seq} {params.seq:q} 2> {log}; "
+            "wget -O {output.tax} {params.tax:q} 2>> {log}"
diff --git a/workflow/rules/get_NCBI_DB.smk b/workflow/rules/get_NCBI_DB.smk
@@ -0,0 +1,22 @@
+# Build the classification reference (resources/ref-seqs.qza and
+# resources/ref-taxa.qza) from NCBI with RESCRIPt, using the configured query.
+# The rule is only defined when NCBI is the single database enabled in the
+# config. The three checks must be joined with `and`: a parenthesised,
+# comma-separated list is a tuple, and a non-empty tuple is always truthy.
+if (
+    config["database"]["NCBI"] == True
+    and config["database"]["Silva"] == False
+    and config["database"]["Greengenes"] == False
+):
+
+    rule get_NCBI_ref:
+        output:
+            seq="resources/ref-seqs.qza",
+            tax="resources/ref-taxa.qza",
+        params:
+            query=config["database"]["NCBI-query"],
+        log:
+            "logs/prep_NCBI.log",
+        conda:
+            "../envs/qiime-only-env.yaml"
+        shell:
+            # The query (e.g. 33175[BioProject]) contains shell glob characters,
+            # so it is shell-quoted with the `:q` specifier.
+            "qiime rescript get-ncbi-data "
+            "--p-query {params.query:q} "
+            "--o-sequences {output.seq} "
+            "--o-taxonomy {output.tax} "
+            "--verbose 2> {log} "
diff --git a/workflow/rules/get_SILVA_DB.smk b/workflow/rules/get_SILVA_DB.smk
@@ -0,0 +1,21 @@
+# Fetch the SILVA reference sequences and taxonomy as the classification
+# reference (resources/ref-seqs.qza and resources/ref-taxa.qza).
+# The rule is only defined when Silva is the single database enabled in the
+# config. The three checks must be joined with `and`: a parenthesised,
+# comma-separated list is a tuple, and a non-empty tuple is always truthy.
+if (
+    config["database"]["Silva"] == True
+    and config["database"]["Greengenes"] == False
+    and config["database"]["NCBI"] == False
+):
+
+    rule get_SILVA:
+        output:
+            seq="resources/ref-seqs.qza",
+            tax="resources/ref-taxa.qza",
+        params:
+            seq=str(config["database"]["download-path-seq"]),
+            tax=str(config["database"]["download-path-tax"]),
+        log:
+            "logs/prep_SILVA.log",
+        conda:
+            "../envs/python.yaml"
+        shell:
+            # Write straight to the declared outputs (no `cd`), shell-quote the
+            # URLs, and send wget's messages to the declared log file.
+            "wget -O {output.seq} {params.seq:q} 2> {log}; "
+            "wget -O {output.tax} {params.tax:q} 2>> {log}"
diff --git a/workflow/rules/preprocessing.smk b/workflow/rules/preprocessing.smk
@@ -1,12 +1,8 @@
 rule get_database:
     output:
-        seq="resources/silva-138-99-seqs.qza",
-        tax="resources/silva-138-99-tax.qza",
-        genomic="resources/GRCh38_latest_genomic.fna.gz",
-        kraken="resources/minikraken2_v2_8GB_201904.tgz",
+        genomic=temp("resources/ref-genome.fna.gz"),
+        kraken=temp("resources/filtering-database.tgz"),
     params:
-        seq=str(config["database"]["download-path-seq"]),
-        tax=str(config["database"]["download-path-tax"]),
         genomic=str(config["database"]["ref-genome"]),
         kraken=str(config["database"]["kraken-db"]),
     log:
@@ -15,31 +11,8 @@ rule get_database:
         "../envs/python.yaml"
     shell:
         "cd resources; "
-        "wget {params.genomic}; "
-        "wget {params.kraken}; "
-        "wget {params.seq}; "
-        "wget {params.tax}; "
-
-
-# rule get_SILVA:
-#    output:
-#        seq_rna=temp("resources/silva-138.1-ssu-nr99-rna-seqs.qza"),
-#        tax=temp("resources/silva-138-99-tax.qza"),
-#    params:
-#        version="138.1",
-#        target="SSURef_NR99",
-#    log:
-#        "logs/prerp_SILVA.log",
-#    conda:
-#        "../envs/qiime-only-env.yaml"
-#    shell:
-#        "qiime rescript get-silva-data "
-#        "--p-version {params.version} "
-#        "--p-target {params.target} "
-#        "--p-include-species-labels "
-#        "--o-silva-sequences {output.seq_rna} "
-#        "--o-silva-taxonomy {output.tax} "
-#        "2> {log}"
+        "wget -O ref-genome.fna.gz {params.genomic}; "
+        "wget -O filtering-database.tgz {params.kraken}; "
 
 
 # rule rna_to_dna_SILVA:
@@ -56,13 +29,11 @@ rule get_database:
 #       "--i-rna-sequences {input} "
 #       "--o-dna-sequences {output} "
 #       "2> {log}"
-
-
 rule unzip_ref_gen:
     input:
-        "resources/GRCh38_latest_genomic.fna.gz",
+        "resources/ref-genome.fna.gz",
     output:
-        temp("resources/GRCh38_latest_genomic.fna"),
+        temp("resources/ref-genome.fna"),
     log:
         "logs/unzip_ref_gen.log",
     conda:
@@ -73,9 +44,9 @@ rule unzip_ref_gen:
 
 rule lower_to_upper:
     input:
-        "resources/GRCh38_latest_genomic.fna",
+        "resources/ref-genome.fna",
     output:
-        temp("resources/GRCh38_latest_genomic_upper.fna"),
+        temp("resources/ref-genome_upper.fna"),
     log:
         "logs/lower_to_upper_fasta.log",
     conda:
@@ -86,9 +57,9 @@ rule lower_to_upper:
 
 rule import_ref_genome:
     input:
-        "resources/GRCh38_latest_genomic_upper.fna",
+        "resources/ref-genome_upper.fna",
     output:
-        temp("resources/GRCh38_latest_genomic_upper.qza"),
+        temp("resources/ref-genome_upper.qza"),
     log:
         "logs/import_ref_gen.log",
     conda:
@@ -103,15 +74,22 @@ rule import_ref_genome:
 
 rule unzip_kraken:
     input:
-        "resources/minikraken2_v2_8GB_201904.tgz",
+        "resources/filtering-database.tgz",
     output:
-        temp(directory("resources/minikraken2_v2_8GB_201904_UPDATE")),
+        temp(directory("resources/filtering-database")),
     log:
         "logs/unzip_kraken_db.log",
     conda:
         "../envs/python.yaml"
     shell:
-        "tar -zxvf {input} -C resources;"
+        # Unpack the archive into the output directory, then lift the files of
+        # its top-level folder(s) up one level so the database files sit
+        # directly in {output}, whatever the archive's folder is called.
+        # `mkdir -p` tolerates an existing directory, `mv` avoids holding a
+        # second copy of the database on disk, and tar's listing goes to the log.
+        "mkdir -p {output}; "
+        "tar -zxvf {input} --directory {output} > {log} 2>&1; "
+        "mv {output}/*/* {output}/; "
+        "rm -rf {output}/*/"
 
 
 if config["bowtie"] == False:

diff --git a/workflow/rules/qualitycontroll.smk b/workflow/rules/qualitycontroll.smk
@@ -2,7 +2,7 @@ if config["datatype"] == "SampleData[PairedEndSequencesWithQuality]":
 
     rule kraken_analysis:
         input:
-            db="resources/minikraken2_v2_8GB_201904_UPDATE/",
+            db="resources/filtering-database/",
             read1="data/{date}/{sample}_L001_R1_001.fastq.gz",
             read2="data/{date}/{sample}_L001_R2_001.fastq.gz",
         output:
@@ -24,7 +24,7 @@ if config["datatype"] == "SampleData[SequencesWithQuality]":
 
     rule kraken_analysis:
         input:
-            db="resources/minikraken2_v2_8GB_201904_UPDATE/",
+            db="resources/filtering-database/",
             read="data/{date}/{sample}_L001_R1_001.fastq.gz",
         output:
             report="results/{date}/out/kraken/{sample}.kreport2",

diff --git a/workflow/rules/reduced_analysis.smk b/workflow/rules/reduced_analysis.smk
@@ -62,7 +62,7 @@ rule demux_stats:
         "--verbose 2> {log}"
 
 
-if config["data-type"] == "human":
+if config["data-type"] == "human" and config["bowtie"] == False:
 
     rule visual_humancount:
         input:
@@ -99,7 +99,7 @@ if config["data-type"] == "human":
             "../scripts/extract_humancount.py"
 
 
-if config["data-type"] == "environmental":
+if config["data-type"] == "environmental" or config["bowtie"] == True:
 
     rule unzip_human_dummy:
         output: