diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e1bdbef7..881f6ec2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,9 +77,9 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--use-conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache --all-temp" + args: "--use-conda --show-failed-logs --cores all --conda-cleanup-pkgs cache --all-temp" - run-3prime-rna-workflow: + run-three-prime-rna-workflow: runs-on: ubuntu-latest needs: - linting @@ -88,15 +88,13 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - with: - submodules: recursive - name: Test 3-prime-workflow uses: snakemake/snakemake-github-action@v1 with: - directory: .test/3-prime-config - snakefile: workflow/Snakefile - args: "--use-conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache --all-temp" + directory: .test/three_prime + snakefile: .test/three_prime/workflow/Snakefile + args: "--use-conda --show-failed-logs --cores all --conda-cleanup-pkgs cache --all-temp" # Disable report testing for now since we mark all output files as temporary above. # TODO: add some kind of test mode to report generation which does not really try to include # results. diff --git a/.test/3-prime-config/config/samples.tsv b/.test/3-prime-config/config/samples.tsv deleted file mode 100644 index f400f357..00000000 --- a/.test/3-prime-config/config/samples.tsv +++ /dev/null @@ -1,5 +0,0 @@ -sample condition batch_effect -A treated batch1 -B untreated batch1 -C treated batch2 -D untreated batch2 diff --git a/.test/3-prime-config/config/units.tsv b/.test/3-prime-config/config/units.tsv deleted file mode 100644 index 3ece8ee8..00000000 --- a/.test/3-prime-config/config/units.tsv +++ /dev/null @@ -1,6 +0,0 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 -A 1 300 14 ../ngs-test-data/reads/a.chr21.2.fq -B 1 300 14 ../ngs-test-data/reads/b.chr21.1.fq -B 2 300 14 ../ngs-test-data/reads/b.chr21.2.fq -C 1 300 14 ../ngs-test-data/reads/a.chr21.2.fq -D 1 300 14 ../ngs-test-data/reads/b.chr21.2.fq diff --git a/.test/3-prime-config/workflow/Snakefile b/.test/3-prime-config/workflow/Snakefile deleted file mode 100644 index ce03a28f..00000000 --- a/.test/3-prime-config/workflow/Snakefile +++ /dev/null @@ -1,33 +0,0 @@ -from snakemake.utils import min_version - -min_version("7.17.0") - - -configfile: "config/config.yaml" - - -report: "report/workflow.rst" - - -# this container defines the underlying OS for each job when using the workflow -# with --use-conda --use-singularity -container: "docker://continuumio/miniconda3" - - -include: "rules/common.smk" -include: "rules/trim.smk" -include: "rules/trim_3prime.smk" -include: "rules/qc_3prime.smk" -include: "rules/ref.smk" -include: "rules/ref_3prime.smk" -include: "rules/quant.smk" -include: "rules/quant_3prime.smk" -include: "rules/diffexp.smk" -include: "rules/diffsplice.smk" -include: "rules/enrichment.smk" -include: "rules/datavzrd.smk" - - -rule all: - input: - all_input, diff --git a/.test/three_prime/README.md b/.test/three_prime/README.md new file mode 100644 index 00000000..a94fa24e --- /dev/null +++ b/.test/three_prime/README.md @@ -0,0 +1,12 @@ +The testing data for this test case is downloaded from this Zenodo dataset: +https://zenodo.org/doi/10.5281/zenodo.10572745 + +It has been generated by this snakemake workflow: +https://github.com/dlaehnemann/create-quant-seq-testing-dataset + +So it is based from data from this publication: + +Corley, S.M., Troy, N.M., Bosco, A. et al. QuantSeq. 3′ Sequencing combined with Salmon provides a fast, reliable approach for high throughput RNA expression analysis. Sci Rep 9, 18895 (2019). https://doi.org/10.1038/s41598-019-55434-x + +The MSigDB gene sets are used according to their [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/), which is given here: +https://www.gsea-msigdb.org/gsea/msigdb_license_terms.jsp \ No newline at end of file diff --git a/.test/3-prime-config/config/config.yaml b/.test/three_prime/config/config.yaml similarity index 97% rename from .test/3-prime-config/config/config.yaml rename to .test/three_prime/config/config.yaml index f4ff4fa3..d2f62ef1 100644 --- a/.test/3-prime-config/config/config.yaml +++ b/.test/three_prime/config/config.yaml @@ -18,7 +18,7 @@ resources: # ensembl species name species: homo_sapiens # ensembl release version - release: "104" + release: "111" # genome build build: GRCh38 # pfam release to use for annotation of domains in differential splicing analysis @@ -45,12 +45,12 @@ diffexp: # model for sleuth differential expression analysis models: model_X: - full: ~condition + batch_effect - reduced: ~batch_effect + full: ~condition + reduced: ~1 # Binary valued covariate that shall be used for fold change/effect size # based downstream analyses. primary_variable: condition - base_level: untreated + base_level: Control # significance level to use for volcano, ma- and qq-plots sig-level: volcano-plot: 0.05 @@ -82,7 +82,7 @@ enrichment: fdr_genes: 0.05 fdr_go_terms: 0.05 fgsea: - gene_sets_file: "../ngs-test-data/ref/dummy.gmt" + gene_sets_file: "config/gene_sets.gmt" # tool is only run if set to `true` activate: true # if activated, you need to provide a GMT file with gene sets of interest diff --git a/.test/three_prime/config/gene_sets.gmt b/.test/three_prime/config/gene_sets.gmt new file mode 100644 index 00000000..4a7e000d --- /dev/null +++ b/.test/three_prime/config/gene_sets.gmt @@ -0,0 +1,2 @@ +UROSEVIC_RESPONSE_TO_IMIQUIMOD https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/UROSEVIC_RESPONSE_TO_IMIQUIMOD BRCA1 CCL8 CXCL11 IDO1 IFI35 IFI6 IFITM1 IL6 IRF7 ISG15 ISG20 MICB MX1 OAS2 OASL PLAAT4 SECTM1 STAT1 TRAFD1 +KEGG_PROTEASOME https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/KEGG_PROTEASOME IFNG POMP PSMA1 PSMA2 PSMA3 PSMA4 PSMA5 PSMA6 PSMA6P4 PSMA7 PSMA8 PSMB1 PSMB10 PSMB11 PSMB2 PSMB3 PSMB4 PSMB5 PSMB6 PSMB7 PSMB8 PSMB9 PSMC1 PSMC1P4 PSMC2 PSMC3 PSMC4 PSMC5 PSMC6 PSMD1 PSMD11 PSMD12 PSMD13 PSMD14 PSMD2 PSMD3 PSMD4 PSMD6 PSMD7 PSMD8 PSME1 PSME2 PSME3 PSME4 PSMF1 SEM1 \ No newline at end of file diff --git a/.test/three_prime/config/samples.tsv b/.test/three_prime/config/samples.tsv new file mode 100644 index 00000000..db857bfc --- /dev/null +++ b/.test/three_prime/config/samples.tsv @@ -0,0 +1,7 @@ +sample condition +SRR8309096 Control +SRR8309094 Control +SRR8309095 Treated +SRR8309097 Treated +SRR8309098 Control +SRR8309099 Treated \ No newline at end of file diff --git a/.test/three_prime/config/units.tsv b/.test/three_prime/config/units.tsv new file mode 100644 index 00000000..e83b4e11 --- /dev/null +++ b/.test/three_prime/config/units.tsv @@ -0,0 +1,7 @@ +sample unit fragment_len_mean fragment_len_sd fq1 fq2 +SRR8309096 u1 430 43 quant_seq_test_data/SRR8309096.fastq.gz +SRR8309094 u1 430 43 quant_seq_test_data/SRR8309094.fastq.gz +SRR8309095 u1 430 43 quant_seq_test_data/SRR8309095.fastq.gz +SRR8309097 u1 430 43 quant_seq_test_data/SRR8309097.fastq.gz +SRR8309098 u1 430 43 quant_seq_test_data/SRR8309098.fastq.gz +SRR8309099 u1 430 43 quant_seq_test_data/SRR8309099.fastq.gz \ No newline at end of file diff --git a/.test/three_prime/workflow/Snakefile b/.test/three_prime/workflow/Snakefile new file mode 100644 index 00000000..506de131 --- /dev/null +++ b/.test/three_prime/workflow/Snakefile @@ -0,0 +1,44 @@ +from snakemake.utils import min_version + +min_version("7.17.0") + + +configfile: "config/config.yaml" + + +# declare main workflow as a module +module rna_seq_kallisto_sleuth: + snakefile: + "../../../workflow/Snakefile" + config: + config + + +use rule * from rna_seq_kallisto_sleuth + + +rule download_quant_seq_testing_data: + output: + "quant_seq_test_data.tar.gz", + log: + "logs/download_quant_seq_testing_data.log", + shell: + "wget https://zenodo.org/records/10572746/files/quant_seq_test_data.tar.gz 2>{log} " + + +rule extract_quant_seq_testing_data: + input: + "quant_seq_test_data.tar.gz", + output: + "quant_seq_test_data/README.md", + "quant_seq_test_data/SRR8309099.fastq.gz", + "quant_seq_test_data/SRR8309095.fastq.gz", + "quant_seq_test_data/SRR8309096.fastq.gz", + "quant_seq_test_data/samples.tsv", + "quant_seq_test_data/SRR8309097.fastq.gz", + "quant_seq_test_data/SRR8309094.fastq.gz", + "quant_seq_test_data/SRR8309098.fastq.gz", + log: + "logs/extract_quant_seq_testing_data.log", + shell: + "tar xzfv {input} 2>{log}" diff --git a/workflow/resources/datavzrd/diffexp-template.yaml b/workflow/resources/datavzrd/diffexp-template.yaml index 59ba3770..8152cc71 100644 --- a/workflow/resources/datavzrd/diffexp-template.yaml +++ b/workflow/resources/datavzrd/diffexp-template.yaml @@ -247,8 +247,6 @@ views: display-mode: normal gene_desc: display-mode: normal - canonical: - display-mode: normal num_aggregated_transcripts: display-mode: normal sum_mean_obs_counts: diff --git a/workflow/scripts/fgsea.R b/workflow/scripts/fgsea.R index a48e6ebc..63b01ecc 100644 --- a/workflow/scripts/fgsea.R +++ b/workflow/scripts/fgsea.R @@ -4,6 +4,8 @@ sink(log, type="message") library("fgsea") library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE) +# avoid this error: https://github.com/ctlab/fgsea/issues/98 +library("data.table") # provides library("tidyverse") and functions load_bioconductor_package() and # get_prefix_col(), the latter requires snakemake@output[["samples"]] and @@ -61,13 +63,11 @@ if ( (fgsea_res %>% count() %>% pull(n)) == 0 ) { unnested <- fgsea_res %>% unnest(leadingEdge) %>% add_column( - leading_edge_symbol = NA, leading_edge_ens_gene = NA, leading_edge_entrez_id = NA ) %>% dplyr::relocate( c( - leading_edge_symbol, leading_edge_ens_gene, leading_edge_entrez_id, leadingEdge @@ -84,21 +84,18 @@ if ( (fgsea_res %>% count() %>% pull(n)) == 0 ) { annotated <- fgsea_res %>% unnest(leadingEdge) %>% mutate( - leading_edge_symbol = str_to_title(leadingEdge), - leading_edge_entrez_id = mapIds(leading_edge_symbol, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENTREZID"), - leading_edge_ens_gene = mapIds(leading_edge_symbol, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENSEMBL") + leading_edge_entrez_id = mapIds(leadingEdge, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENTREZID"), + leading_edge_ens_gene = mapIds(leadingEdge, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENSEMBL") ) %>% group_by(pathway) %>% summarise( leadingEdge = str_c(leadingEdge, collapse = ','), - leading_edge_symbol = str_c(leading_edge_symbol, collapse = ','), leading_edge_entrez_id = str_c(leading_edge_entrez_id, collapse = ','), leading_edge_ens_gene = str_c(leading_edge_ens_gene, collapse = ',') ) %>% inner_join(fgsea_res %>% dplyr::select(-leadingEdge), by = "pathway") %>% dplyr::relocate( c( - leading_edge_symbol, leading_edge_ens_gene, leading_edge_entrez_id, leadingEdge @@ -133,7 +130,7 @@ tg <- plotGseaTable( ) ggsave(filename = snakemake@output[["plot"]], plot = tg, width = 15, height = height, limitsize=FALSE) -collapsed_pathways <- collapsePathways(fgsea_res %>% arrange(pval) %>% filter(padj <= snakemake@params[["gene_set_fdr"]]), gene_sets, ranked_genes) +collapsed_pathways <- collapsePathways(fgsea_res %>% arrange(pval) %>% filter(padj <= snakemake@params[["gene_set_fdr"]]) %>% data.table(), gene_sets, ranked_genes) main_pathways <- fgsea_res %>% filter(pathway %in% collapsed_pathways$mainPathways) %>% arrange(-NES) %>% pull(pathway) selected_gene_sets <- gene_sets[main_pathways] height = .7 * (length(selected_gene_sets) + 2) diff --git a/workflow/scripts/get-transcript-info.R b/workflow/scripts/get-transcript-info.R index d63661a7..59b3751e 100644 --- a/workflow/scripts/get-transcript-info.R +++ b/workflow/scripts/get-transcript-info.R @@ -32,7 +32,7 @@ while (class(mart)[[1]] != "Mart") { # change or make configurable if you want more or # less rounds of tries of all the mirrors if (rounds >= 3) { - stop( + cli_abort( str_c( "Have tried all 4 available Ensembl biomaRt mirrors ", rounds, diff --git a/workflow/scripts/sleuth-diffexp.R b/workflow/scripts/sleuth-diffexp.R index f3679a65..bd8e2da1 100644 --- a/workflow/scripts/sleuth-diffexp.R +++ b/workflow/scripts/sleuth-diffexp.R @@ -26,6 +26,7 @@ mean_var_plot <- plot_mean_var(sleuth_object, ggsave(snakemake@output[["mean_var_plot"]], mean_var_plot) write_results <- function(so, mode, output, output_all) { + print(str_c("Running write_results function in mode: ", mode)) so$pval_aggregate <- FALSE if(mode == "aggregate") { # workaround the following bug-request: @@ -173,6 +174,7 @@ write_results <- function(so, mode, output, output_all) { } else if (mode == "custom") { # load custom ID list id_version_pattern <- "\\.\\d+$" + print(str_c("Loading representative transcripts file: ", snakemake@params[["representative_transcripts"]])) ids <- read_tsv(snakemake@params[["representative_transcripts"]], col_names = "ID")$ID ids <- str_replace(ids, id_version_pattern, "") all <- all %>% @@ -186,9 +188,11 @@ write_results <- function(so, mode, output, output_all) { } # saving qq-plots + print(str_c("Saving qq-plots")) marrange_qq <- marrangeGrob(grobs=qq_list, nrow=1, ncol=1, top = NULL) ggsave(snakemake@output[["qq_plots"]], plot = marrange_qq, width = 14) + print(str_c("Writing RDS file to: ", output_all)) write_rds(all, path = output_all, compress = "none") # add sample expressions all <- all %>% left_join(as_tibble(sleuth_to_matrix(so, "obs_norm", "est_counts"), rownames="target_id"))