diff --git a/CHANGELOG.md b/CHANGELOG.md index 51953318..775841d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update template to v2.13.1 ([#309](https://github.com/nf-core/scrnaseq/pull/309)) - Update to kallisto|bustools v0.28.2 ([#294](https://github.com/nf-core/scrnaseq/pull/294)) - Fix cellrangerarc matrix conversions and protocol selection ([#300](https://github.com/nf-core/scrnaseq/pull/300)) +- Add new emptydrops calling module ([#301](https://github.com/nf-core/scrnaseq/pull/301)) ## v2.5.1 diff --git a/bin/emptydrops_cell_calling.R b/bin/emptydrops_cell_calling.R new file mode 100755 index 00000000..23a45267 --- /dev/null +++ b/bin/emptydrops_cell_calling.R @@ -0,0 +1,52 @@ +#!/usr/bin/env Rscript +library("DropletUtils") +library("Matrix") + +args <- commandArgs(trailingOnly=TRUE) + +fn_mtx <- args[1] +fn_barcodes <- args[2] +fn_genes <- args[3] +outdir <- args[4] +aligner <- args[5] + +# Read matrix/barcodes/genes +genes <- read.table(fn_genes,sep='\t') +barcodes <- read.table(fn_barcodes,sep='\t') +mtx <- readMM(fn_mtx) + +get_name <- function(file) { + name <- as.character(basename(file)) + name <- gsub('\\.gz$', '', name) + return(name) +} + +# transpose matrices when required +# based on code of 'mtx_to_seurat.R', only the data from kallisto and alevin would require transposition +print("Only kallisto and alevin have transposed matrices.") +if (aligner %in% c( "kallisto", "alevin" )) { + is_transposed <- TRUE + mtx<-t(mtx) +} else { + is_transposed <- FALSE +} + + +# Call empty drops +e.out <- emptyDrops(mtx) +is.cell <- e.out$FDR <= 0.01 + +# Slice matrix and barcodes +mtx_filtered <-mtx[,which(is.cell),drop=FALSE] +barcodes_filtered<-barcodes[which(is.cell),] + +# If matrix was transposed early, need to transpose back +if (is_transposed){ + mtx_filtered<-t(mtx_filtered) + print('Transposing back matrix.') +} + +# Write output +writeMM(mtx_filtered,file.path(outdir,get_name(fn_mtx))) +write.table(barcodes_filtered,file=file.path(outdir,get_name(fn_barcodes)),col.names=FALSE,row.names=FALSE,sep='\t',quote=FALSE) +write.table(genes,file=file.path(outdir,get_name(fn_genes)),col.names=FALSE,row.names=FALSE,sep='\t',quote=FALSE) diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py index 3282122d..2f5dc9ba 100755 --- a/bin/mtx_to_h5ad.py +++ b/bin/mtx_to_h5ad.py @@ -32,9 +32,13 @@ def _mtx_to_adata( aligner: str, ): adata = sc.read_mtx(mtx_file) - if ( - aligner == "star" - ): # for some reason star matrix comes transposed and doesn't fit when values are appended directly + # for some reason star matrix comes transposed and doesn't fit when values are appended directly + # also true for cellranger files ( this is only used when running with the custom emptydrops_filtered files ) + # otherwise, it uses the cellranger .h5 files + if aligner in [ + "cellranger", + "star", + ]: adata = adata.transpose() adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values @@ -57,22 +61,36 @@ def input_to_adata( if verbose and (txp2gene or star_index): print("Reading in {}".format(input_data)) - if aligner == "cellranger": + # + # open main data + # + if aligner == "cellranger" and input_data.lower().endswith('.h5'): adata = _10x_h5_to_adata(input_data, sample) else: adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner) + # + # open gene information + # if verbose and (txp2gene or star_index): print("Reading in {}".format(txp2gene)) - if txp2gene: - t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) - elif star_index: - t2g = pd.read_table( - f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1] - ) - - if txp2gene or star_index: + if aligner == "cellranger" and not input_data.lower().endswith('.h5'): + # + # for cellranger workflow, we do not have a txp2gene file, so, when using this normal/manual function for empty drops + # we need to provide this information coming directly from the features.tsv file + # by not using the .h5 file for conversion, we loose the two col information: feature_types and genome + # + t2g = pd.read_table(feature_file, header=None, names=["gene_id", "gene_symbol", "feature_types"], usecols=[0, 1, 2]) + else: + if txp2gene: + t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) + elif star_index: + t2g = pd.read_table( + f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1] + ) + + if txp2gene or star_index or (aligner == "cellranger" and not input_data.lower().endswith('.h5')): t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") adata.var["gene_symbol"] = t2g["gene_symbol"] diff --git a/bin/mtx_to_seurat.R b/bin/mtx_to_seurat.R index f2680838..99ce2f73 100755 --- a/bin/mtx_to_seurat.R +++ b/bin/mtx_to_seurat.R @@ -3,23 +3,40 @@ library(Seurat) args <- commandArgs(trailingOnly=TRUE) -mtx_file <- args[1] -barcode_file <- args[2] -feature_file <- args[3] -out.file <- args[4] -aligner <- args[5] +mtx_file <- args[1] +barcode_file <- args[2] +feature_file <- args[3] +out.file <- args[4] +aligner <- args[5] +is_emptydrops <- args[6] + +if (is_emptydrops == "--is_emptydrops") { + is_emptydrops <- TRUE +} else{ + is_emptydrops <- FALSE +} -if(aligner %in% c("kallisto", "alevin")) { +if (aligner %in% c( "kallisto", "alevin" )) { + print("1") # for kallisto and alevin, the features file contains only one column and matrix needs to be transposed expression.matrix <- ReadMtx( mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1, mtx.transpose = TRUE ) } else { - expression.matrix <- ReadMtx( - mtx = mtx_file, features = feature_file, cells = barcode_file - ) + if (aligner %in% c( "cellranger", "star" ) && is_emptydrops) { + print("2") + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1 + ) + } else{ + print("3") + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file + ) + } } + seurat.object <- CreateSeuratObject(counts = expression.matrix) dir.create(basename(dirname(out.file)), showWarnings = FALSE) diff --git a/conf/modules.config b/conf/modules.config index d4f3d531..b69f5f4e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -29,6 +29,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -46,6 +47,20 @@ process { ] } + if (!params.skip_emptydrops) { + withName: EMPTYDROPS_CELL_CALLING { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> + if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}" + else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}" + else "${meta.id}/${filename}" + } + ] + } + } + withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' { publishDir = [ path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, @@ -205,11 +220,12 @@ if (params.aligner == 'kallisto') { ] } withName: KALLISTOBUSTOOLS_COUNT { + def kb_filter = (params.kb_filter) ? '--filter' : '' publishDir = [ path: { "${params.outdir}/${params.aligner}" }, mode: params.publish_dir_mode ] - ext.args = "--workflow ${params.kb_workflow}" + ext.args = "--workflow ${params.kb_workflow} ${kb_filter}" } } } diff --git a/conf/test.config b/conf/test.config index 45ee54c8..08ab1b69 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,8 @@ params { max_time = '6.h' // Input data - input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' + input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' + skip_emptydrops = true // module does not work on small dataset // Genome references fasta = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa' diff --git a/docs/output.md b/docs/output.md index 7e9f0cd8..cff2d442 100644 --- a/docs/output.md +++ b/docs/output.md @@ -19,6 +19,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Cellranger](#cellranger) - [Cellranger ARC](#cellranger-arc) - [UniverSC](#universc) + - [Custom emptydrops filter](#custom-emptydrops-filter) - [Other output data](#other-output-data) - [MultiQC](#multiqc) - [Pipeline information](#pipeline-information) @@ -128,6 +129,16 @@ Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Min - Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by the open-source implementation of Cell Ranger run via UniverSC +## Custom emptydrops filter + +The pipeline also possess a module to perform empty-drops calling and filtering with a custom-made script that uses a library called `bioconductor-dropletutils` that is available in `bioconda`. The process is simple, it takes a raw/unfiltered matrix file, and performs the empty-drops calling and filtering on it, generating another matrix file. + +> Users can turn it of with `--skip_emptydrops`. + +**Output directory: `results/${params.aligner}/emptydrops_filtered`** + +- Contains the empty-drops filtered matrices results generated by the `bioconductor-dropletutils` custom script + ## Other output data **Output directory: `results/reference_genome`** @@ -143,6 +154,21 @@ Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Min - `*_matrix.h5ad` - `.mtx` files converted to [AnnData](https://anndata.readthedocs.io/en/latest/) in `.h5ad` format, using [scanpy package](https://scanpy.readthedocs.io/en/stable/). - One per sample and a single one with all samples concatenated together `combined_matrix.h5ad` +- `*_matrix.rds` + - `.mtx` files converted to R native data format, rds, using the [Seurat package](https://github.com/satijalab/seurat) + - One per sample + +Because the pipeline has both the data directly from the aligners, and from the custom empty-drops filtering module the conversion modules were modified to understand the difference between raw/filtered from the aligners itself and filtered from the custom empty-drops module. So, to try to avoid confusion by the user, we added "suffixes" to the generated converted files so that we have provenance from what input it came from. + +So, the conversion modules generate data with the following syntax: **`*_{raw,filtered,custom_emptydrops_filter}_matrix.{h5ad,rds}`**. With the following meanings: + +| suffix | meaning | +| :----------------------- | :--------------------------------------------------------------------------------------------------------------------------------------- | +| raw | Conversion of the raw/unprocessed matrix generated by the tool. It is also used for tools that generate only one matrix, such as alevin. | +| filtered | Conversion of the filtered/processed matrix generated by the tool | +| custom_emptydrops_filter | Conversion of the matrix that was generated by the new custom empty drops filter module | + +> Some aligners, like `alevin` do not produce both raw&filtered matrices. When aligners give only one output, they are treated with the `raw` suffix. Some aligners may have an option to give both raw&filtered and only one, like `kallisto`. Be aware when using the tools. ## MultiQC diff --git a/main.nf b/main.nf index 3471e5b0..7d8ba356 100644 --- a/main.nf +++ b/main.nf @@ -17,10 +17,9 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { SCRNASEQ } from './workflows/scrnaseq' +include { SCRNASEQ } from './workflows/scrnaseq' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline' - include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_scrnaseq_pipeline' /* @@ -28,9 +27,9 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_scrn GENOME PARAMETER VALUES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - -params.fasta = getGenomeAttribute('fasta') -params.gtf = getGenomeAttribute('gtf') +// we cannot modify params. here, we must load the files +ch_genome_fasta = params.genome ? file( getGenomeAttribute('fasta'), checkIfExists: true ) : [] +ch_gtf = params.genome ? file( getGenomeAttribute('gtf'), checkIfExists: true ) : [] /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -45,6 +44,8 @@ workflow NFCORE_SCRNASEQ { take: samplesheet // channel: samplesheet read in from --input + ch_genome_fasta + ch_gtf main: @@ -52,7 +53,9 @@ workflow NFCORE_SCRNASEQ { // WORKFLOW: Run pipeline // SCRNASEQ ( - samplesheet + samplesheet, + ch_genome_fasta, + ch_gtf ) emit: @@ -86,7 +89,9 @@ workflow { // WORKFLOW: Run main workflow // NFCORE_SCRNASEQ ( - PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.samplesheet, + ch_genome_fasta, + ch_gtf ) // diff --git a/modules.json b/modules.json index 32171104..5d8614a7 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "cellranger/count": { "branch": "master", - "git_sha": "a2dfd9a0b2e192243695711c723d652959de39fc", + "git_sha": "92ca535c5a8c0fe89eb71e649ee536bd355ce4fc", "installed_by": ["modules"] }, "cellranger/mkgtf": { @@ -52,7 +52,7 @@ }, "kallistobustools/count": { "branch": "master", - "git_sha": "de8215983defba48cd81961d620a9e844f11c7e7", + "git_sha": "9d3e489286eead7dfe1010fd324904d8b698eca7", "installed_by": ["modules"] }, "kallistobustools/ref": { diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf index 96920f9e..cd08cbbe 100644 --- a/modules/local/concat_h5ad.nf +++ b/modules/local/concat_h5ad.nf @@ -7,7 +7,7 @@ process CONCAT_H5AD { 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" input: - path h5ad + tuple val(input_type), path(h5ad) path samplesheet output: @@ -20,7 +20,7 @@ process CONCAT_H5AD { """ concat_h5ad.py \\ --input $samplesheet \\ - --out combined_matrix.h5ad \\ + --out combined_${input_type}_matrix.h5ad \\ --suffix "_matrix.h5ad" """ diff --git a/modules/local/emptydrops.nf b/modules/local/emptydrops.nf new file mode 100644 index 00000000..e0b77435 --- /dev/null +++ b/modules/local/emptydrops.nf @@ -0,0 +1,101 @@ +process EMPTYDROPS_CELL_CALLING { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bioconductor-dropletutils" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-dropletutils:1.18.0--r42hf17093f_1' : + 'quay.io/biocontainers/bioconductor-dropletutils:1.18.0--r42hf17093f_1' }" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. + tuple val(meta), path(inputs) + + output: + tuple val(meta), path("emptydrops_filtered"), emit: filtered_matrices + + when: + task.ext.when == null || task.ext.when + + script: + if (params.aligner == "cellranger") { + + matrix = "matrix.mtx.gz" + barcodes = "barcodes.tsv.gz" + features = "features.tsv.gz" + + } else if (params.aligner == "kallisto") { + + matrix = "counts_unfiltered/*.mtx" + barcodes = "counts_unfiltered/*.barcodes.txt" + features = "counts_unfiltered/*.genes.names.txt" + + // kallisto allows the following workflows: ["standard", "lamanno", "nac"] + // lamanno creates "spliced" and "unspliced" + // nac creates "nascent", "ambiguous" "mature" + // also, lamanno produces a barcodes and genes file for both spliced and unspliced + // while nac keep only one for all the different .mtx files produced + kb_non_standard_files = "" + if (params.kb_workflow == "lamanno") { + kb_non_standard_files = "spliced unspliced" + matrix = "counts_unfiltered/\${input_type}.mtx" + barcodes = "counts_unfiltered/\${input_type}.barcodes.txt" + features = "counts_unfiltered/\${input_type}.genes.txt" + } + if (params.kb_workflow == "nac") { + kb_non_standard_files = "nascent ambiguous mature" + matrix = "counts_unfiltered/*\${input_type}.mtx" + features = "counts_unfiltered/*.genes.txt" + } // barcodes tsv has same pattern as standard workflow + + } else if (params.aligner == "alevin") { + + matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" + barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" + features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + + } else if (params.aligner == 'star') { + + matrix = "raw/matrix.mtx.gz" + barcodes = "raw/barcodes.tsv.gz" + features = "raw/features.tsv.gz" + + } + + // + // run script + // + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + """ + # convert file types + for input_type in ${kb_non_standard_files} ; do + mkdir -p emptydrops_filtered/\${input_type} + emptydrops_cell_calling.R \\ + ${matrix} \\ + ${barcodes} \\ + ${features} \\ + emptydrops_filtered/\${input_type} \\ + ${params.aligner} \\ + 0 + done + """ + + else + """ + mkdir emptydrops_filtered/ + emptydrops_cell_calling.R \\ + $matrix \\ + $barcodes \\ + $features \\ + emptydrops_filtered \\ + ${params.aligner} \\ + 0 + """ + + stub: + """ + mkdir emptydrops_filtered + touch emptydrops_filtered/empty_file + """ +} diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index c991b695..ba8a807e 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -15,52 +15,100 @@ process MTX_TO_H5AD { path star_index output: - path "${meta.id}/*h5ad", emit: h5ad - path "${meta.id}/*", emit: counts - path "versions.yml", emit: versions + tuple val(input_type), path("${meta.id}/*h5ad") , emit: h5ad + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - // def file paths for aligners (except cellranger) - if (params.aligner == 'kallisto') { - mtx_matrix = "*count/counts_unfiltered/*.mtx" - barcodes_tsv = "*count/counts_unfiltered/*.barcodes.txt" - features_tsv = "*count/counts_unfiltered/*.genes.names.txt" + // Get a file to check input type. Some aligners bring arrays instead of a single file. + def input_to_check = (inputs instanceof String) ? inputs : inputs[0] + + // check input type of inputs + input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' + if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output + if (input_to_check.toUriString().contains('emptydrops')) { input_type = 'custom_emptydrops_filter' } + + // def file paths for aligners. Cellranger is normally converted with the .h5 files + // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing + if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type == 'custom_emptydrops_filter') { + + aligner = 'cellranger' + txp2gene = '' + star_index = '' + mtx_matrix = "emptydrops_filtered/matrix.mtx" + barcodes_tsv = "emptydrops_filtered/barcodes.tsv" + features_tsv = "emptydrops_filtered/features.tsv" + + } else if (params.aligner == 'kallisto') { + + kb_pattern = (input_type == 'raw') ? 'un' : '' + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + if ((input_type == 'custom_emptydrops_filter') && (params.kb_workflow != 'standard')) { mtx_dir = 'emptydrops_filtered/\${input_type}' } // dir has subdirs for non-standard workflows + mtx_matrix = "${mtx_dir}/*.mtx" + barcodes_tsv = "${mtx_dir}/*.barcodes.txt" + features_tsv = "${mtx_dir}/*.genes.names.txt" + + // kallisto allows the following workflows: ["standard", "lamanno", "nac"] + // lamanno creates "spliced" and "unspliced" + // nac creates "nascent", "ambiguous" "mature" + // also, lamanno produces a barcodes and genes file for both spliced and unspliced + // while nac keep only one for all the different .mtx files produced + kb_non_standard_files = "" + if (params.kb_workflow == "lamanno") { + kb_non_standard_files = "spliced unspliced" + matrix = "${mtx_dir}/\${input_type}.mtx" + barcodes_tsv = "${mtx_dir}/\${input_type}.barcodes.txt" + features_tsv = "${mtx_dir}/\${input_type}.genes.txt" + } + if (params.kb_workflow == "nac") { + kb_non_standard_files = "nascent ambiguous mature" + matrix = "${mtx_dir}/*\${input_type}.mtx" + features_tsv = "${mtx_dir}/*.genes.txt" + } // barcodes tsv has same pattern as standard workflow + } else if (params.aligner == 'alevin') { - mtx_matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" - barcodes_tsv = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" - features_tsv = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + + // alevin does not have filtered/unfiltered results + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' + mtx_matrix = "${mtx_dir}/quants_mat.mtx" + barcodes_tsv = "${mtx_dir}/quants_mat_rows.txt" + features_tsv = "${mtx_dir}/quants_mat_cols.txt" + } else if (params.aligner == 'star') { - mtx_matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" - barcodes_tsv = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" - features_tsv = "*.Solo.out/Gene*/filtered/features.tsv.gz" + + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}" + suffix = (input_type == 'custom_emptydrops_filter') ? '' : '.gz' + mtx_matrix = "${mtx_dir}/matrix.mtx${suffix}" + barcodes_tsv = "${mtx_dir}/barcodes.tsv${suffix}" + features_tsv = "${mtx_dir}/features.tsv${suffix}" + } // // run script // - if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) + if (params.aligner in [ 'cellranger', 'cellrangerarc' ] && input_type != 'custom_emptydrops_filter') """ # convert file types mtx_to_h5ad.py \\ --aligner cellranger \\ - --input filtered_feature_bc_matrix.h5 \\ + --input ${input_type}_feature_bc_matrix.h5 \\ --sample ${meta.id} \\ - --out ${meta.id}/${meta.id}_matrix.h5ad + --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ # convert file types - for input_type in nascent ambiguous mature ; do + for input_type in ${kb_non_standard_files} ; do mtx_to_h5ad.py \\ --aligner ${params.aligner} \\ --sample ${meta.id} \\ - --input *count/counts_unfiltered/cells_x_genes.\${input_type}.mtx \\ - --barcode $barcodes_tsv \\ - --feature $features_tsv \\ + --input ${matrix} \\ + --barcode ${barcodes_tsv} \\ + --feature ${features_tsv} \\ --txp2gene ${txp2gene} \\ --star_index ${star_index} \\ --out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ; @@ -79,7 +127,7 @@ process MTX_TO_H5AD { --feature $features_tsv \\ --txp2gene ${txp2gene} \\ --star_index ${star_index} \\ - --out ${meta.id}/${meta.id}_matrix.h5ad + --out ${meta.id}/${meta.id}_${input_type}_matrix.h5ad """ stub: diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 82ee63cd..3ba636ff 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -19,23 +19,76 @@ process MTX_TO_SEURAT { script: def aligner = params.aligner + + + // Get a file to check input type. Some aligners bring arrays instead of a single file. + def input_to_check = (inputs instanceof String) ? inputs : inputs[0] + + // check input type of inputs + def is_emptydrops = '0' + input_type = (input_to_check.toUriString().contains('unfiltered') || input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered' + if ( params.aligner == 'alevin' ) { input_type = 'raw' } // alevin has its own filtering methods and mostly output a single mtx, raw here means, the base tool output + if (input_to_check.toUriString().contains('emptydrops')) { + input_type = 'custom_emptydrops_filter' + is_emptydrops = '--is_emptydrops' + } + + // def file paths for aligners. Cellranger is normally converted with the .h5 files + // However, the emptydrops call, always generate .mtx files, thus, cellranger 'emptydrops' required a parsing if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { - matrix = "matrix.mtx.gz" - barcodes = "barcodes.tsv.gz" - features = "features.tsv.gz" - } else if (params.aligner == "kallisto") { - matrix = "*count/counts_unfiltered/*.mtx" - barcodes = "*count/counts_unfiltered/*.barcodes.txt" - features = "*count/counts_unfiltered/*.genes.names.txt" + + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered/' : '' + matrix = "${mtx_dir}matrix.mtx*" + barcodes = "${mtx_dir}barcodes.tsv*" + features = "${mtx_dir}features.tsv*" + + } else if (params.aligner == 'kallisto') { + + kb_pattern = (input_type == 'raw') ? 'un' : '' + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "counts_${kb_pattern}filtered" + if ((input_type == 'custom_emptydrops_filter') && (params.kb_workflow != 'standard')) { mtx_dir = 'emptydrops_filtered/\${input_type}' } // dir has subdirs for non-standard workflows + matrix = "${mtx_dir}/*.mtx" + barcodes = "${mtx_dir}/*.barcodes.txt" + features = "${mtx_dir}/*.genes.names.txt" + + // kallisto allows the following workflows: ["standard", "lamanno", "nac"] + // lamanno creates "spliced" and "unspliced" + // nac creates "nascent", "ambiguous" "mature" + // also, lamanno produces a barcodes and genes file for both spliced and unspliced + // while nac keep only one for all the different .mtx files produced + kb_non_standard_files = "" + if (params.kb_workflow == "lamanno") { + kb_non_standard_files = "spliced unspliced" + matrix = "${mtx_dir}/\${input_type}.mtx" + barcodes = "${mtx_dir}/\${input_type}.barcodes.txt" + features = "${mtx_dir}/\${input_type}.genes.txt" + } + if (params.kb_workflow == "nac") { + kb_non_standard_files = "nascent ambiguous mature" + matrix = "${mtx_dir}/*\${input_type}.mtx" + features = "${mtx_dir}/*.genes.txt" + } // barcodes tsv has same pattern as standard workflow + } else if (params.aligner == "alevin") { - matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" - barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" - features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : '*_alevin_results/af_quant/alevin' + matrix = "${mtx_dir}/quants_mat.mtx" + barcodes = "${mtx_dir}/quants_mat_rows.txt" + features = "${mtx_dir}/quants_mat_cols.txt" + } else if (params.aligner == 'star') { - matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" - barcodes = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" - features = "*.Solo.out/Gene*/filtered/features.tsv.gz" + + mtx_dir = (input_type == 'custom_emptydrops_filter') ? 'emptydrops_filtered' : "${input_type}" + suffix = (input_type == 'custom_emptydrops_filter') ? '' : '.gz' + matrix = "${mtx_dir}/matrix.mtx${suffix}" + barcodes = "${mtx_dir}/barcodes.tsv${suffix}" + features = "${mtx_dir}/features.tsv${suffix}" + } + + // + // run script + // """ mkdir ${meta.id} """ @@ -43,13 +96,14 @@ process MTX_TO_SEURAT { if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') """ # convert file types - for input_type in nascent ambiguous mature ; do + for input_type in ${kb_non_standard_files} ; do mtx_to_seurat.R \\ - *count/counts_unfiltered/cells_x_genes.\${input_type}.mtx \\ - $barcodes \\ - $features \\ + ${matrix} \\ + ${barcodes} \\ + ${features} \\ ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\ - ${aligner} + ${aligner} \\ + ${is_emptydrops} done """ @@ -59,8 +113,9 @@ process MTX_TO_SEURAT { $matrix \\ $barcodes \\ $features \\ - ${meta.id}/${meta.id}_matrix.rds \\ - ${aligner} + ${meta.id}/${meta.id}_${input_type}_matrix.rds \\ + ${aligner} \\ + ${is_emptydrops} """ stub: diff --git a/modules/local/star_align.nf b/modules/local/star_align.nf index a7dfab3d..4b3df1e1 100644 --- a/modules/local/star_align.nf +++ b/modules/local/star_align.nf @@ -21,12 +21,14 @@ process STAR_ALIGN { val other_10x_parameters output: - tuple val(meta), path('*d.out.bam') , emit: bam - tuple val(meta), path('*.Solo.out') , emit: counts - tuple val(meta), path('*Log.final.out') , emit: log_final - tuple val(meta), path('*Log.out') , emit: log_out - tuple val(meta), path('*Log.progress.out'), emit: log_progress - path "versions.yml" , emit: versions + tuple val(meta), path('*d.out.bam') , emit: bam + tuple val(meta), path('*.Solo.out') , emit: counts + tuple val(meta), path ("*.Solo.out/Gene*/raw") , emit: raw_counts + tuple val(meta), path ("*.Solo.out/Gene*/filtered"), emit: filtered_counts + tuple val(meta), path('*Log.final.out') , emit: log_final + tuple val(meta), path('*Log.out') , emit: log_out + tuple val(meta), path('*Log.progress.out') , emit: log_progress + path "versions.yml" , emit: versions tuple val(meta), path('*sortedByCoord.out.bam') , optional:true, emit: bam_sorted tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript diff --git a/modules/nf-core/cellranger/count/environment.yml b/modules/nf-core/cellranger/count/environment.yml deleted file mode 100644 index 662f747d..00000000 --- a/modules/nf-core/cellranger/count/environment.yml +++ /dev/null @@ -1,5 +0,0 @@ -name: cellranger_count -channels: - - conda-forge - - bioconda - - defaults diff --git a/modules/nf-core/cellranger/count/main.nf b/modules/nf-core/cellranger/count/main.nf index 42aa09c9..1811d745 100644 --- a/modules/nf-core/cellranger/count/main.nf +++ b/modules/nf-core/cellranger/count/main.nf @@ -9,8 +9,10 @@ process CELLRANGER_COUNT { path reference output: - tuple val(meta), path("**/outs/**"), emit: outs - path "versions.yml" , emit: versions + tuple val(meta), path("**/outs/**") , emit: outs + tuple val(meta), path("**/outs/filtered_feature_bc_matrix**"), emit: filtered + tuple val(meta), path("**/outs/raw_feature_bc_matrix**") , emit: raw + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -32,7 +34,11 @@ process CELLRANGER_COUNT { def prefix = task.ext.prefix ?: "${meta.id}" """ mkdir -p "${prefix}/outs/" + mkdir -p "${prefix}/outs/filtered_feature_bc_matrix" + mkdir -p "${prefix}/outs/raw_feature_bc_matrix" echo "$prefix" > ${prefix}/outs/fake_file.txt + echo "$prefix" > ${prefix}/outs/filtered_feature_bc_matrix/fake_file.txt + echo "$prefix" > ${prefix}/outs/raw_feature_bc_matrix/fake_file.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/cellranger/count/meta.yml b/modules/nf-core/cellranger/count/meta.yml index 1f1768a8..56244976 100644 --- a/modules/nf-core/cellranger/count/meta.yml +++ b/modules/nf-core/cellranger/count/meta.yml @@ -40,6 +40,14 @@ output: type: file description: Files containing the outputs of Cell Ranger, see official 10X Genomics documentation for a complete list pattern: "${meta.id}/outs/*" + - filtered: + type: file + description: Files containing the filtered outputs of Cell Ranger. + pattern: "**/outs/filtered_feature_bc_matrix**" + - raw: + type: file + description: Files containing the raw outputs of Cell Ranger. + pattern: "**/outs/raw_feature_bc_matrix**" - versions: type: file description: File containing software version diff --git a/modules/nf-core/cellranger/count/tests/main.nf.test.snap b/modules/nf-core/cellranger/count/tests/main.nf.test.snap index 7eafafd0..edfb304b 100644 --- a/modules/nf-core/cellranger/count/tests/main.nf.test.snap +++ b/modules/nf-core/cellranger/count/tests/main.nf.test.snap @@ -33,13 +33,61 @@ "single_end": false, "strandedness": "auto" }, - "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + [ + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] ] ], "1": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ], + "2": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ], + "3": [ "versions.yml:md5,30cee1a9146b01c48d9b1db6bbe813b6" ], + "filtered": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ], "outs": [ + [ + { + "id": "test_10x", + "single_end": false, + "strandedness": "auto" + }, + [ + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d", + "fake_file.txt:md5,0d98223c768861fd6af96f00148dbb8d" + ] + ] + ], + "raw": [ [ { "id": "test_10x", @@ -58,6 +106,6 @@ "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-03-05T17:16:12.322822411" + "timestamp": "2024-03-18T11:41:17.258523741" } } \ No newline at end of file diff --git a/modules/nf-core/kallistobustools/count/main.nf b/modules/nf-core/kallistobustools/count/main.nf index 841ea2fe..1efda00a 100644 --- a/modules/nf-core/kallistobustools/count/main.nf +++ b/modules/nf-core/kallistobustools/count/main.nf @@ -17,9 +17,11 @@ process KALLISTOBUSTOOLS_COUNT { val workflow_mode output: - tuple val(meta), path ("*.count") , emit: count - path "versions.yml" , emit: versions - path "*.count/*/*.mtx" , emit: matrix //Ensure that kallisto finished and produced outputs + tuple val(meta), path ("*.count") , emit: count + tuple val(meta), path ("*.count/counts_unfiltered"), emit: raw_counts + tuple val(meta), path ("*.count/counts_filtered") , emit: filtered_counts, optional: true + path "versions.yml" , emit: versions + path "*.count/*/*.mtx" , emit: matrix //Ensure that kallisto finished and produced outputs when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/kallistobustools/count/meta.yml b/modules/nf-core/kallistobustools/count/meta.yml index 55d5dc6c..d491dffa 100644 --- a/modules/nf-core/kallistobustools/count/meta.yml +++ b/modules/nf-core/kallistobustools/count/meta.yml @@ -58,6 +58,14 @@ output: type: file description: kb count output folder pattern: "*.{count}" + - raw_counts: + type: file + description: kb raw counts output folder + pattern: "*.{count}/counts_unfiltered" + - filtered_counts: + type: file + description: kb filtered counts output folder + pattern: "*.{count}/counts_filtered" - versions: type: file description: File containing software versions diff --git a/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap b/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap index 3378c3c1..6f6b3183 100644 --- a/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap +++ b/modules/nf-core/kallistobustools/count/tests/main.nf.test.snap @@ -15,9 +15,22 @@ ] ], "1": [ - "versions.yml:md5,6ec06270afe0a7572c41567160d927d9" + [ + { + "id": "test" + }, + [ + "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] ], "2": [ + + ], + "3": [ + "versions.yml:md5,6ec06270afe0a7572c41567160d927d9" + ], + "4": [ "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" ], "count": [ @@ -31,10 +44,23 @@ ] ] ] + ], + "filtered_counts": [ + ], "matrix": [ "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" ], + "raw_counts": [ + [ + { + "id": "test" + }, + [ + "cells_x_genes.mtx:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], "versions": [ "versions.yml:md5,6ec06270afe0a7572c41567160d927d9" ] @@ -44,7 +70,7 @@ "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-03-01T15:48:45.775904562" + "timestamp": "2024-03-18T11:38:48.980939376" }, "genome.fasta + genome.gtf + '10X3' + 'standard'": { "content": [ diff --git a/nextflow.config b/nextflow.config index 54bd4ab3..8fefe42c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -31,6 +31,7 @@ params { kb_workflow = "standard" kb_t1c = null kb_t2c = null + kb_filter = false // STARsolo parameters star_index = null @@ -39,15 +40,18 @@ params { star_feature = "Gene" // Cellranger parameters - cellranger_index = null + cellranger_index = null // Cellranger ARC parameters - motifs = null - cellrangerarc_config = null + motifs = null + cellrangerarc_config = null cellrangerarc_reference = null - // UniverSC paramaters - universc_index = null + // UniverSC parameters + universc_index = null + + // Emptydrops parameters + skip_emptydrops = false // Template Boilerplate options skip_multiqc = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 23f6e9b5..b799d78c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -87,6 +87,10 @@ "skip_fastqc": { "type": "boolean", "description": "Skip FastQC" + }, + "skip_emptydrops": { + "type": "boolean", + "description": "Skip custom empty drops filter module" } } }, @@ -164,7 +168,7 @@ "txp2gene": { "type": "string", "description": "Path to transcript to gene mapping file. This allows the specification of a transcript to gene mapping file for Salmon Alevin and AlevinQC.", - "help_text": "> This is not the same as the `kallisto_gene_map` parameter down below and is only used by the Salmon Alevin workflow.", + "help_text": "> This is only used by the Salmon Alevin workflow.", "fa_icon": "fas fa-map-marked-alt", "format": "file-path", "exists": true @@ -243,6 +247,11 @@ "description": "Type of workflow. Use `nac` for an index type that can quantify nascent and mature RNA. Use `lamanno` for RNA velocity based on La Manno et al. 2018 logic. (default: standard)", "fa_icon": "fas fa-rainbow", "enum": ["standard", "lamanno", "nac"] + }, + "kb_filter": { + "type": "boolean", + "fa_icon": "fas fa-fish", + "description": "Activate Kallisto/BUStools filtering algorithm" } } }, diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf index bfdd533e..39e54675 100644 --- a/subworkflows/local/align_cellranger.nf +++ b/subworkflows/local/align_cellranger.nf @@ -42,6 +42,7 @@ workflow CELLRANGER_ALIGN { emit: ch_versions - cellranger_out = CELLRANGER_COUNT.out.outs - star_index = cellranger_index + cellranger_out = CELLRANGER_COUNT.out.outs + cellranger_matrices = CELLRANGER_COUNT.out.filtered.mix( CELLRANGER_COUNT.out.raw ) + star_index = cellranger_index } diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf index b6549094..d420ab01 100644 --- a/subworkflows/local/kallisto_bustools.nf +++ b/subworkflows/local/kallisto_bustools.nf @@ -55,7 +55,8 @@ workflow KALLISTO_BUSTOOLS { emit: ch_versions counts = KALLISTOBUSTOOLS_COUNT.out.count - txp2gene - + raw_counts = KALLISTOBUSTOOLS_COUNT.out.raw_counts + filtered_counts = KALLISTOBUSTOOLS_COUNT.out.filtered_counts + txp2gene = txp2gene.collect() } diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 958da400..55541f18 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -14,14 +14,6 @@ workflow MTX_CONVERSION { main: ch_versions = Channel.empty() - // Cellranger module output contains too many files which cause path collisions, we filter to the ones we need. - if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { - mtx_matrices = mtx_matrices.map { meta, mtx_files -> - [ meta, mtx_files.findAll { it.toString().contains("filtered_feature_bc_matrix") } ] - } - .filter { meta, mtx_files -> mtx_files } // Remove any that are missing the relevant files - } - // // Convert matrix to h5ad // @@ -34,8 +26,15 @@ workflow MTX_CONVERSION { // // Concat sample-specific h5ad in one // + ch_concat_h5ad_input = MTX_TO_H5AD.out.h5ad.groupTuple() // gather all sample-specific files / per type + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') { + // when having spliced / unspliced matrices, the collected tuple has two levels ( [[mtx_1, mtx_2]] ) + // which nextflow break because it is not a valid 'path' thus, we have to remove one level + // making it as [ mtx_1, mtx_2 ] + ch_concat_h5ad_input = ch_concat_h5ad_input.map{ type, matrices -> [ type, matrices.flatten().toList() ] } + } CONCAT_H5AD ( - MTX_TO_H5AD.out.h5ad.collect(), // gather all sample-specific files + ch_concat_h5ad_input, samplesheet ) @@ -51,6 +50,6 @@ workflow MTX_CONVERSION { emit: ch_versions - counts = MTX_TO_H5AD.out.counts + // counts = MTX_TO_H5AD.out.counts was this ever used? } diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf index e6c95409..0c11acd1 100644 --- a/subworkflows/local/starsolo.nf +++ b/subworkflows/local/starsolo.nf @@ -60,5 +60,7 @@ workflow STARSOLO { star_index = star_index.map{ meta, index -> index } star_result = STAR_ALIGN.out.tab star_counts = STAR_ALIGN.out.counts + raw_counts = STAR_ALIGN.out.raw_counts + filtered_counts = STAR_ALIGN.out.filtered_counts for_multiqc = STAR_ALIGN.out.log_final.map{ meta, it -> it } } diff --git a/tests/main_pipeline_alevin.test b/tests/main_pipeline_alevin.test index 4bbd3cfb..dc6c081a 100644 --- a/tests/main_pipeline_alevin.test +++ b/tests/main_pipeline_alevin.test @@ -43,8 +43,8 @@ nextflow_pipeline { // // Check if files were produced // - {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -56,7 +56,9 @@ nextflow_pipeline { path( "${outputDir}/results_alevin/alevin/Sample_X_alevin_results/af_quant/alevin/quants_mat_rows.txt" ), path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_cols.txt" ), path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat.mtx" ), - path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_rows.txt" ) + path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_rows.txt" ), + path( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ) ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_alevin.test.snap b/tests/main_pipeline_alevin.test.snap index 1ea49820..e3d19f72 100644 --- a/tests/main_pipeline_alevin.test.snap +++ b/tests/main_pipeline_alevin.test.snap @@ -25,8 +25,14 @@ "quants_mat_rows.txt:md5,6227df5a13127b71c71fb18cd8574857", "quants_mat_cols.txt:md5,e9868982c17a330392e38c2a5933cf97", "quants_mat.mtx:md5,54cd12666016adce94c025b2e07f4b02", - "quants_mat_rows.txt:md5,6b458a7777260ba90eccbe7919df934b" + "quants_mat_rows.txt:md5,6b458a7777260ba90eccbe7919df934b", + "Sample_X_raw_matrix.rds:md5,ad35ee66bf2fc3d5d4656c19a7e64e2b", + "Sample_Y_raw_matrix.rds:md5,baf584142205b1d42bb6fdab1f22a06a" ], - "timestamp": "2024-01-19T10:28:35.652763852" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T14:49:46.831540515" } } diff --git a/tests/main_pipeline_cellranger.test b/tests/main_pipeline_cellranger.test index b7b34fa5..ea68eca6 100644 --- a/tests/main_pipeline_cellranger.test +++ b/tests/main_pipeline_cellranger.test @@ -30,12 +30,12 @@ nextflow_pipeline { {assert workflow.success}, // How many tasks were executed? - {assert workflow.trace.tasks().size() == 13}, + {assert workflow.trace.tasks().size() == 18}, // How many results were produced? {assert path("${outputDir}/results_cellranger").list().size() == 4}, {assert path("${outputDir}/results_cellranger/cellranger").list().size() == 4}, - {assert path("${outputDir}/results_cellranger/cellranger/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_cellranger/cellranger/mtx_conversions").list().size() == 5}, {assert path("${outputDir}/results_cellranger/cellranger/count").list().size() == 3}, {assert path("${outputDir}/results_cellranger/fastqc").list().size() == 12}, {assert path("${outputDir}/results_cellranger/multiqc").list().size() == 3}, @@ -43,8 +43,10 @@ nextflow_pipeline { // // Check if files were produced // - {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_filtered_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -63,8 +65,10 @@ nextflow_pipeline { path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/barcodes.tsv.gz" ), path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/features.tsv.gz" ), path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/matrix.mtx.gz" ), - path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), - path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ) + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_filtered_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.rds" ) ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_cellranger.test.snap b/tests/main_pipeline_cellranger.test.snap index 865d651b..7767964f 100644 --- a/tests/main_pipeline_cellranger.test.snap +++ b/tests/main_pipeline_cellranger.test.snap @@ -14,8 +14,8 @@ "errorMessage": "", "trace": { "tasksFailed": 0, - "tasksCount": 13, - "tasksSucceeded": 13 + "tasksCount": 18, + "tasksSucceeded": 18 }, "name": "workflow", "success": true @@ -32,9 +32,15 @@ "barcodes.tsv.gz:md5,081f72b5252ccaf5ffd535ffbd235c4c", "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", "matrix.mtx.gz:md5,a4db04e43e650accc96361a287126a6b", - "Sample_X_matrix.rds:md5,f9191ba575a3ab79ada4807715f18573", - "Sample_Y_matrix.rds:md5,7be3f7b29d668dcf7e951b9f4d371a5e" + "Sample_X_raw_matrix.rds:md5,306a5477ace4d43d851b8389fdfeaf1f", + "Sample_Y_raw_matrix.rds:md5,74b31532da4cae5a8197d690021d77fc", + "Sample_X_filtered_matrix.rds:md5,f9191ba575a3ab79ada4807715f18573", + "Sample_Y_filtered_matrix.rds:md5,7be3f7b29d668dcf7e951b9f4d371a5e" ], - "timestamp": "2024-01-22T15:19:20.134275449" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T15:10:31.638485641" } } diff --git a/tests/main_pipeline_kallisto.test b/tests/main_pipeline_kallisto.test index ae6675b1..12e78144 100644 --- a/tests/main_pipeline_kallisto.test +++ b/tests/main_pipeline_kallisto.test @@ -44,8 +44,8 @@ nextflow_pipeline { // // Check if files were produced // - {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -58,8 +58,8 @@ nextflow_pipeline { path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.barcodes.txt" ), path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.genes.txt" ), path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.mtx" ), - path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), - path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ) + path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ) ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_kallisto.test.snap b/tests/main_pipeline_kallisto.test.snap index 1eb15749..f9b9c96a 100644 --- a/tests/main_pipeline_kallisto.test.snap +++ b/tests/main_pipeline_kallisto.test.snap @@ -3,13 +3,13 @@ "content": [ { "stderr": [ - + ], "errorReport": "", "exitStatus": 0, "failed": false, "stdout": [ - + ], "errorMessage": "", "trace": { @@ -26,9 +26,13 @@ "cells_x_genes.barcodes.txt:md5,a8cf7ea4b2d075296a94bf066a64b7a4", "cells_x_genes.genes.txt:md5,acd9d00120f52031974b2add3e7521b6", "cells_x_genes.mtx:md5,abd83de117204d0a77df3c92d00cc025", - "Sample_X_matrix.rds:md5,0938f4189b7a7fd1030abfcee798741c", - "Sample_Y_matrix.rds:md5,93c12abe283ab37c5f37e5cd3cb25302" + "Sample_X_raw_matrix.rds:md5,0938f4189b7a7fd1030abfcee798741c", + "Sample_Y_raw_matrix.rds:md5,93c12abe283ab37c5f37e5cd3cb25302" ], - "timestamp": "2024-02-27T12:19:47.921508953" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T14:51:42.040931572" } -} +} \ No newline at end of file diff --git a/tests/main_pipeline_star.test b/tests/main_pipeline_star.test index 111c5243..37c54d57 100644 --- a/tests/main_pipeline_star.test +++ b/tests/main_pipeline_star.test @@ -30,20 +30,22 @@ nextflow_pipeline { {assert workflow.success}, // How many tasks were executed? - {assert workflow.trace.tasks().size() == 12}, + {assert workflow.trace.tasks().size() == 17}, // How many results were produced? {assert path("${outputDir}/results_star").list().size() == 4}, {assert path("${outputDir}/results_star/star").list().size() == 3}, - {assert path("${outputDir}/results_star/star/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_star/star/mtx_conversions").list().size() == 5}, {assert path("${outputDir}/results_star/fastqc").list().size() == 12}, {assert path("${outputDir}/results_star/multiqc").list().size() == 3}, // // Check if files were produced // - {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, - {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_filtered_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.h5ad" ).exists()}, // // Check if files are the same @@ -55,15 +57,15 @@ nextflow_pipeline { path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/matrix.mtx.gz" ), path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/features.tsv.gz" ), path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/barcodes.tsv.gz" ), - // path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ), // does not match - path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.SJ.out.tab" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Barcodes.stats" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/matrix.mtx.gz" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/features.tsv.gz" ), path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/barcodes.tsv.gz" ), - // path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ), // does not match - path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_raw_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_raw_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_filtered_matrix.rds" ), + path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_filtered_matrix.rds" ), ).match()} ) // end of assertAll() diff --git a/tests/main_pipeline_star.test.snap b/tests/main_pipeline_star.test.snap index eaa97b8b..0aae74cf 100644 --- a/tests/main_pipeline_star.test.snap +++ b/tests/main_pipeline_star.test.snap @@ -14,8 +14,8 @@ "errorMessage": "", "trace": { "tasksFailed": 0, - "tasksCount": 12, - "tasksSucceeded": 12 + "tasksCount": 17, + "tasksSucceeded": 17 }, "name": "workflow", "success": true @@ -25,14 +25,20 @@ "matrix.mtx.gz:md5,6a923393343aa1a69b0cf1bd998c9285", "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", "barcodes.tsv.gz:md5,9a7dacaa1779ea43c1507a947fe6992a", - "Sample_X_matrix.rds:md5,aa2d36dd8507aba864347c88e4ce0d27", "Sample_Y.SJ.out.tab:md5,98bd31104a860cf80119dc30d938d163", "Barcodes.stats:md5,2dbf1ae426c1afd97903ee001f0db5ce", "matrix.mtx.gz:md5,0ae080bd0002e350531a5816e159345e", "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", "barcodes.tsv.gz:md5,9b695b0b91bcb146ec9c4688ca10a690", - "Sample_Y_matrix.rds:md5,d459af8f99258bcc88b80b2f7c58e911" + "Sample_X_raw_matrix.rds:md5,31604db3e7846acc8d9a60b1a171ce78", + "Sample_Y_raw_matrix.rds:md5,1a52c823e91acce2b29621c8c99c8c72", + "Sample_X_filtered_matrix.rds:md5,aa2d36dd8507aba864347c88e4ce0d27", + "Sample_Y_filtered_matrix.rds:md5,d459af8f99258bcc88b80b2f7c58e911" ], - "timestamp": "2024-01-19T15:46:22.470527538" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-14T16:30:25.7971791" } } diff --git a/tests/nextflow.config b/tests/nextflow.config index 66e56d29..7efd4642 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -22,6 +22,9 @@ params { gtf = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/gencode.vM19.annotation.chr19.gtf' protocol = '10XV2' + // small dataset does not have sufficient data for emptydrops module + skip_emptydrops = true + validationSchemaIgnoreParams = 'genomes' } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 64d75390..7171c970 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -1,17 +1,17 @@ -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { FASTQC_CHECK } from '../subworkflows/local/fastqc' -include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' -include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' -include { STARSOLO } from '../subworkflows/local/starsolo' -include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" -include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" -include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" -include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" -include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' - -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_scrnaseq_pipeline' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { FASTQC_CHECK } from '../subworkflows/local/fastqc' +include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' +include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' +include { STARSOLO } from '../subworkflows/local/starsolo' +include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" +include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" +include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" +include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" +include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' +include { EMPTYDROPS_CELL_CALLING } from '../modules/local/emptydrops' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_scrnaseq_pipeline' include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' @@ -19,6 +19,8 @@ workflow SCRNASEQ { take: ch_fastq + ch_genome_fasta + ch_gtf main: @@ -27,9 +29,11 @@ workflow SCRNASEQ { error "Only cellranger supports `protocol = 'auto'`. Please specify the protocol manually!" } + // overwrite fasta and gtf if user provide a custom one + ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : ch_genome_fasta) + ch_gtf = Channel.value(params.gtf ? file(params.gtf) : ch_gtf) + // general input and params - ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : []) - ch_gtf = params.gtf ? file(params.gtf) : [] ch_transcript_fasta = params.transcript_fasta ? file(params.transcript_fasta): [] ch_motifs = params.motifs ? file(params.motifs) : [] ch_cellrangerarc_config = params.cellrangerarc_config ? file(params.cellrangerarc_config) : [] @@ -96,7 +100,7 @@ workflow SCRNASEQ { ch_fastq ) ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.counts) + ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.raw_counts, KALLISTO_BUSTOOLS.out.filtered_counts) ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene } @@ -130,7 +134,7 @@ workflow SCRNASEQ { protocol_config.get('extra_args', ""), ) ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.star_counts) + ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.raw_counts, STARSOLO.out.filtered_counts) ch_star_index = STARSOLO.out.star_index ch_multiqc_files = ch_multiqc_files.mix(STARSOLO.out.for_multiqc) } @@ -145,7 +149,7 @@ workflow SCRNASEQ { protocol_config['protocol'] ) ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) - ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_matrices) ch_star_index = CELLRANGER_ALIGN.out.star_index ch_multiqc_files = ch_multiqc_files.mix(CELLRANGER_ALIGN.out.cellranger_out.map{ meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} @@ -179,6 +183,26 @@ workflow SCRNASEQ { ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) } + // Run emptydrops calling module + if ( !params.skip_emptydrops ) { + + // + // emptydrops should only run on the raw matrices thus, filter-out the filtered result of the aligners that can produce it + // + if ( params.aligner in [ 'cellranger', 'cellrangerarc', 'kallisto', 'star' ] ) { + ch_mtx_matrices_for_emptydrops = + ch_mtx_matrices.filter { meta, mtx_files -> + mtx_files.toString().contains("raw_feature_bc_matrix") || // cellranger + mtx_files.toString().contains("counts_unfiltered") || // kallisto + mtx_files.toString().contains("raw") // star + } + } else { + ch_mtx_matrices_for_emptydrops = ch_mtx_matrices + } + EMPTYDROPS_CELL_CALLING( ch_mtx_matrices_for_emptydrops ) + ch_mtx_matrices = ch_mtx_matrices.mix( EMPTYDROPS_CELL_CALLING.out.filtered_matrices ) + } + // Run mtx to h5ad conversion subworkflow MTX_CONVERSION ( ch_mtx_matrices,