diff --git a/.test/3-prime-config/config/config.yaml b/.test/3-prime-config/config/config.yaml index 1a621d9b..f4ff4fa3 100644 --- a/.test/3-prime-config/config/config.yaml +++ b/.test/3-prime-config/config/config.yaml @@ -120,8 +120,11 @@ params: # the reverse reads of paired end sequencing: # * https://cutadapt.readthedocs.io/en/stable/guide.html#trimming-paired-end-reads cutadapt-se: - adapters: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" - extra: "-q 20" + # This setup is for Lexogen QuantSeq FWD data, based on (but simplfied): + # https://faqs.lexogen.com/faq/what-is-the-adapter-sequence-i-need-to-use-for-t-1 + # For more details, see the QuantSeq section in the `config/README.md` file. + adapters: "-a 'r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=7;max_error_rate=0.005'" + extra: "--minimum-length 33 --nextseq-trim=20 --poly-a" # reasoning behind parameters: # For reads that are produced by 3’-end sequencing, depending on the protocol, it might be recommended to remove some leading bases (e.g. see https://www.nature.com/articles/s41598-019-55434-x#Sec10) # * `--minimum-length 33`: @@ -134,5 +137,5 @@ params: # * `--minimum-overlap 7`: the cutadapt default minimum overlap of `5` did trimming on the level # of expected adapter matches by chance cutadapt-pe: - adapters: "-a ACGGATCGATCGATCGATCGAT -g GGATCGATCGATCGATCGAT -A ACGGATCGATCGATCGATCGAT -G GGATCGATCGATCGATCGAT" - extra: "--minimum-length 33 -e 0.005 --overlap 7" \ No newline at end of file + adapters: "" + extra: "" \ No newline at end of file diff --git a/config/README.md b/config/README.md index 1aef098b..bc994104 100644 --- a/config/README.md +++ b/config/README.md @@ -1,16 +1,72 @@ # General settings -To configure this workflow, modify ``config/config.yaml`` according to your needs, following the explanations provided in the file. +To configure this workflow, modify the following files to reflect your dataset and differential expression analysis model: +* `config/samples.tsv`: samples sheet with covariates and conditions +* `config/units.tsv`: (sequencing) units sheet with raw data paths +* `config/config.yaml`: general workflow configuration and differential expression model setup -# Sample and unit sheet +## samples sheet -* Add samples to `config/samples.tsv`. For each sample, the column `sample` and any number of covariates and conditions have to be specified. - These columns can be used in the model specification section within `config/config.yaml` to define the differential expression analysis of sleuth, including batch effect modeling. - Effect size estimates are calculated as so-called beta-values by sleuth. For binary comparisons, they resemble a log2 fold change. The primary comparison variable should be encoded in a way such that the group of samples that shall be the numerator in the fold change calculation have the lexicographically greater value. For example, you could simply use values like `0` and `1` to achieve this. -* For each sample, add one or more sequencing units (runs, lanes or replicates) to the unit sheet `config/units.tsv`. - For each unit, provide either one (column `fq1`) or two (columns `fq1`, `fq2`) FASTQ files (these can point to anywhere in your system). - If only one FASTQ file is provided (single-end data), you also need to specify `fragment_len_mean` and `fragment_len_sd`, which should usually be available from your sequencing provider. +For each biological sample, add a line to the sample sheet in `config/samples.tsv`. +The column `sample` is required and gives the sample name. +Additional columns can specify covariates (including batch effects) and conditions. +These columns can then be used in the `diffexp: models:` specification section in `config/config.yaml` (see below) Missing values can be specified by empty columns or by writing `NA`. +## units sheet + +For each sample, add one or more sequencing unit lines (runs, lanes or replicates) to the unit sheet in `config/units.tsv`. +For each unit, provide either of the following: +* The path to two pairead-end FASTQ files in the columns `fq1`, `fq2`. +* The path to a single-end FASTQ file in the column `fq1`. + For single-end data, you also need to specify `fragment_len_mean` and `fragment_len_sd`, which should usually be available from your sequencing provider. + +Missing values can be specified by empty columns or by writing `NA`. + +## config.yaml + +This file contains the general workflow configuration and the setup for the differential expression analysis performed by sleuth. +Configurable options should be explained in the comments above the respective entry or right here in this `config/README.md` section. +If something is unclear, don't hesitate to [file an issue in the `rna-seq-kallisto-sleuth` GitHub repository](https://github.com/snakemake-workflows/rna-seq-kallisto-sleuth/issues/new/choose). + +### differential expression model setup + +The core functionality of this workflow is provided by the software [`sleuth`](https://pachterlab.github.io/sleuth/about). +You can use it to test for differential expression of genes or transcripts between two or more subgroups of samples. + +#### main sleuth model + +The main idea of sleuth's internal model, is to test a `full:` model (containing (a) variable(s) of interest AND batch effects) against a `reduced:` model (containing ONLY the batch effects). +So these are the most important entries to set up under any model that you specify via `diffexp: models:`. +If you don't know any batch effects, the `reduced:` model will have to be `~1`. +Otherwise it will be the tilde followed by an addition of the names of any columns that contain batch effects, for example: `reduced: ~batch_effect_1 + batch_effect_2`. +The full model than additionally includes variables of interest, so fore example: `full: ~variable_of_interest + batch_effect_1 + batch_effect_2`. + +#### sleuth effect sizes + +Effect size estimates are calculated as so-called beta-values by `sleuth`. +For binary comparisons (your variable of interest has two factor levels), they resemble a log2 fold change. +To know which variable of interest to use for the effect size calculation, you need to provide its column name as the `primary_variable:`. +And for sleuth to know what level of that variable of interest to use as the base level, specify the respective entry as the `base_level:`. + +### Lexogen 3' QuantSeq data analysis + +For Lexogen 3' QuantSeq data analysis, please set `experiment: 3-prime-rna-seq: activate: true` in the `config/config.yaml` file. +For more information information on Lexogen QuantSeq 3' sequencing, see: https://www.lexogen.com/quantseq-3mrna-sequencing/ +In addition, for Lexogen 3' FWD QuantSeq data, we recommend setting the `params: cutadapt-se:` with: +``` + adapters: "-a 'r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=7;max_error_rate=0.005'" + extra: "--minimum-length 33 --nextseq-trim=20 --poly-a" +``` +This is an adaptation of the [Lexogen read preprocessing recommendations for 3' FWD QuantSeq data](https://faqs.lexogen.com/faq/what-is-the-adapter-sequence-i-need-to-use-for-t-1). +Changes to the recommendations are motivated as follows: +* `-m`: We switched to the easier to read `--minimum-length` and apply this minimum length globally. In addition, we increase the minimum length to a default of 33 that makes more sense for kallisto quantification. +* `-O`: Instead of this option, minimum overlap is specified per expression. +* `-a "polyA=A{20}"`: We replace this with `cutudapt`s dedicated option for `--poly-a` tail removal (which is run after adapter trimming). +* `-a "QUALITY=G{20}"`: We replace this with `cutudapt`s dedicated option for the removal artifactual trailing `G`s in NextSeq and NovaSeq data: `--nextseq-trim=20`. +* `-n`: With the dedicated `cutadapt` options getting applied in the right order, this option is not needed any more. +* `-a "r1adapter=A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000"`: We remove A{18}, as this is handled by `--poly-a`. We increase `min_overlap` to 7 and set the `max_error_rate` to the Illumina error rate of about 0.005, both to avoid spurious adapter matches being removed. +* `-g "r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20"`: This is not needed any more, as `-a` option will lead to complete removal of read sequence if adapter is found at the start of the read, see: https://cutadapt.readthedocs.io/en/stable/guide.html#rightmost +* `--discard-trimmed`: We omit this, as the `-a` with the adapter sequence will lead to complete read sequence removal if adapter is found at start, and the `--minimum-length` will then discard such empty reads. diff --git a/workflow/Snakefile b/workflow/Snakefile index ce03a28f..1d3c8401 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,7 +16,6 @@ container: "docker://continuumio/miniconda3" include: "rules/common.smk" include: "rules/trim.smk" -include: "rules/trim_3prime.smk" include: "rules/qc_3prime.smk" include: "rules/ref.smk" include: "rules/ref_3prime.smk" diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 6efe71ca..51d3754d 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -67,7 +67,7 @@ rule spia_datavzrd: log: "logs/datavzrd-report/spia-{model}/spia-{model}.log", wrapper: - "v1.29.0/utils/datavzrd" + "v2.6.0/utils/datavzrd" rule diffexp_datavzrd: @@ -92,7 +92,7 @@ rule diffexp_datavzrd: log: "logs/datavzrd-report/diffexp.{model}/diffexp.{model}.log", wrapper: - "v1.29.0/utils/datavzrd" + "v2.6.0/utils/datavzrd" rule go_enrichment_datavzrd: @@ -119,4 +119,4 @@ rule go_enrichment_datavzrd: log: "logs/datavzrd-report/go_enrichment-{model}/go_enrichment-{model}_{gene_fdr}.go_term_fdr_{go_term_fdr}.log", wrapper: - "v1.29.0/utils/datavzrd" + "v2.6.0/utils/datavzrd" diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk index 4091e29f..4a731559 100644 --- a/workflow/rules/trim.smk +++ b/workflow/rules/trim.smk @@ -12,22 +12,33 @@ rule cutadapt_pe: log: "logs/cutadapt/{sample}-{unit}.log", wrapper: - "v1.22.0/bio/cutadapt/pe" + "v2.6.0/bio/cutadapt/pe" -if not is_3prime_experiment: +rule cutadapt: + input: + get_fastqs, + output: + fastq="results/trimmed/{sample}-{unit}.fastq.gz", + qc="results/trimmed/{sample}-{unit}.qc.txt", + threads: 8 + params: + adapters=config["params"]["cutadapt-se"]["adapters"], + extra=config["params"]["cutadapt-se"]["extra"], + log: + "results/logs/cutadapt/{sample}-{unit}.log", + wrapper: + "v2.6.0/bio/cutadapt/se" + - rule cutadapt: - input: - get_fastqs, - output: - fastq="results/trimmed/{sample}-{unit}.fastq.gz", - qc="results/trimmed/{sample}-{unit}.qc.txt", - threads: 8 - params: - adapters=config["params"]["cutadapt-se"]["adapters"], - extra=config["params"]["cutadapt-se"]["extra"], - log: - "logs/cutadapt/{sample}-{unit}.log", - wrapper: - "v1.22.0/bio/cutadapt/se" +rule max_read_length: + input: + get_all_fastqs, + output: + "results/stats/max-read-length.json", + log: + "logs/max-read-length.log", + conda: + "../envs/pysam.yaml" + script: + "../scripts/get-max-read-length.py" diff --git a/workflow/rules/trim_3prime.smk b/workflow/rules/trim_3prime.smk deleted file mode 100644 index 9c35d7aa..00000000 --- a/workflow/rules/trim_3prime.smk +++ /dev/null @@ -1,87 +0,0 @@ -if is_3prime_experiment and three_prime_vendor == "lexogen": - # Rule cutadapt1 checks and removes poly-A tails and sequence qualilty score <20. - # reasoning behind parameters: - # * `-m 20`: - # * Discards any read under 20bp by -m 20 - # * `-O 20 -a ""polyA=A{20}"" -a ""QUALITY=G{20}"" -n 2`: - # * Removes reads having polyA (18bp) or polyG (18bp) and repeat the process twice by -n 2 - # In short, it removes reads that are predominantly polyA or polyG. - - # Rule cutadapt2 checks if reads contains the polyA-stretch + adapter at the 3' - # end with minimum overlap 3bp, maximum error of 1bp every 10bp - # reasoning behind parameters: - # * `-m 20`: - # * Discards any read under 20bp by -m 20 - # * `--nextseq-trim=10` - # * Quality score<20 is trimmed, based on NextSeq/NovaSeq 2-color SBS system - # * `-a A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000` - - # Rule cutadapt3 removes final set of adapters from 5'end and discards reads based on trimmed read length - # reasoning behind parameters: - # * `-m 20`: - # * Discards any read under 20bp by -m 20 - # * `-O 20 -g r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20 --discard-trimmed"` - # * Trim the 5' end of adapter with miminum overlap of 20bp and discard its read by --discard-trimmed - - # https://faqs.lexogen.com/faq/what-is-the-adapter-sequence-i-need-to-use-for-t-1 - rule cutadapt1: - input: - get_fastqs, - output: - fastq="results/trimmed/{sample}-{unit}.1.1.fastq.gz", - qc="results/trimmed/{sample}-{unit}.1.1.qc.txt", - params: - extra=lambda w: str( - "-m 20 -O 20 -a " "polyA=A{20}" " -a " "QUALITY=G{20}" " -n 2" - ), - log: - "logs/cutadapt/{sample}-{unit}.1.1.log", - wrapper: - "v1.14.1/bio/cutadapt/se" - - rule cutadapt2: - input: - fastq="results/trimmed/{sample}-{unit}.1.1.fastq.gz", - output: - fastq="results/trimmed/{sample}-{unit}.2.1.fastq.gz", - qc="results/trimmed/{sample}-{unit}.2.1.qc.txt", - params: - adapters=lambda w: str( - '-m 20 -O 3 --nextseq-trim=10 -a "' - 'A{18}AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=3;max_error_rate=0.100000"' - "" - ), - log: - "logs/cutadapt/{sample}-{unit}.2.1.log", - wrapper: - "v1.14.1/bio/cutadapt/se" - - rule cutadapt3: - input: - fastq="results/trimmed/{sample}-{unit}.2.1.fastq.gz", - output: - fastq="results/trimmed/{sample}-{unit}.fastq.gz", - qc="results/trimmed/{sample}-{unit}.qc.txt", - params: - adapters=lambda w: str( - '-m 20 -O 20 -g "' - 'r1adapter=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC;min_overlap=20"' - " --discard-trimmed" - ), - log: - "logs/cutadapt/{sample}-{unit}.log", - wrapper: - "v1.14.1/bio/cutadapt/se" - - -rule max_read_length: - input: - get_all_fastqs, - output: - "results/stats/max-read-length.json", - log: - "logs/max-read-length.log", - conda: - "../envs/pysam.yaml" - script: - "../scripts/get-max-read-length.py" diff --git a/workflow/scripts/plot_diffexp_heatmap.R b/workflow/scripts/plot_diffexp_heatmap.R index c31ace8d..4180992b 100644 --- a/workflow/scripts/plot_diffexp_heatmap.R +++ b/workflow/scripts/plot_diffexp_heatmap.R @@ -53,7 +53,7 @@ if (all(selectedgenes == 0)) { # If number of transcripts is more than 100, # Since for RNA-seq multiple transcripts may present for a single gene. } else if (nrow(selectedgenes) > 100) { - pdf_height <- round(nrow(selectedgenes) / 7) + pdf_height <- round(nrow(selectedgenes) / 5) pdf(snakemake@output[["diffexp_heatmap"]], height = pdf_height, width = ncol(selectedgenes) * 2 ) @@ -63,7 +63,7 @@ if (all(selectedgenes == 0)) { ) dev.off() } else { - pdf_height <- round(nrow(selectedgenes) / 7) + pdf_height <- round(nrow(selectedgenes) / 5) pdf( file = snakemake@output[["diffexp_heatmap"]], height = pdf_height, width = ncol(selectedgenes) * 2 diff --git a/workflow/scripts/spia.R b/workflow/scripts/spia.R index fa68a664..dbd41a5f 100644 --- a/workflow/scripts/spia.R +++ b/workflow/scripts/spia.R @@ -21,7 +21,6 @@ diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>% mutate(ens_gene = str_c("ENSEMBL:", ens_gene)) universe <- diffexp %>% pull(var = ens_gene) sig_genes <- diffexp %>% filter(qval <= 0.05) - if (nrow(sig_genes) == 0) { cols <- c( "Name", "Status", "Combined FDR", @@ -30,7 +29,7 @@ if (nrow(sig_genes) == 0) { "Combined Bonferroni p-values", "p-value to observe a total accumulation", "Combined p-value", "Ids" ) - res <- data.frame(matrix(ncol = 7, nrow = 0, dimnames = list(NULL, cols))) + res <- data.frame(matrix(ncol = 11, nrow = 0, dimnames = list(NULL, cols))) # create empty perturbation plots pdf(file = snakemake@output[["plots"]]) write_tsv(res, snakemake@output[["table"]])