diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 47d1b446..bd713438 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -87,7 +87,8 @@ def check_samplesheet(file_in, file_out): ## Check header MIN_COLS = 2 MIN_HEADER = ["sample", "fastq_1", "fastq_2"] - OPT_HEADER = ["expected_cells", "seq_center"] + OPT_HEADER = ["expected_cells", "seq_center", "fastq_barcode", "sample_type"] + SAMPLE_TYPES = ["gex", "atac"] header = [x.strip('"') for x in fin.readline().strip().split(",")] unknown_header = 0 @@ -101,8 +102,7 @@ def check_samplesheet(file_in, file_out): min_header_count = min_header_count + 1 colmap[h] = i i = i + 1 - if min_header_count < len(MIN_HEADER): - # code was checking for unknown_header or min_header_count however looking at the ifelse, unknown_header does not seem that it should be tested + if unknown_header or min_header_count < len(MIN_HEADER): given = ",".join(header) wanted = ",".join(MIN_HEADER) print(f"ERROR: Please check samplesheet header -> {given} != {wanted}") @@ -147,7 +147,26 @@ def check_samplesheet(file_in, file_out): seq_center = seq_center.replace(" ", "_") ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: + fastq_list = [fastq_1, fastq_2] + + fastq_barcode = "" + if "fastq_barcode" in header: + fastq_barcode = lspl[colmap["fastq_barcode"]] + fastq_list.append(fastq_barcode) + + sample_type = "" + if "sample_type" in header: + sample_type = lspl[colmap["sample_type"]] + if sample_type not in SAMPLE_TYPES: + print_error( + "Sample type {} is not supported! Please specify either {}".format( + sample_type, " or ".join(SAMPLE_TYPES) + ), + "Line", + line, + ) + + for fastq in fastq_list: if fastq: if fastq.find(" ") != -1: print_error("FastQ file contains spaces!", "Line", line) @@ -161,9 +180,9 @@ def check_samplesheet(file_in, file_out): ## Auto-detect paired-end/single-end sample_info = [] ## [single_end, fastq_1, fastq_2] if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center] + sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type] elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center] + sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type] else: print_error("Invalid combination of columns provided!", "Line", line) @@ -180,7 +199,21 @@ def check_samplesheet(file_in, file_out): ## Write validated samplesheet with appropriate columns if len(sample_mapping_dict) > 0: with open(file_out, "w") as fout: - fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "expected_cells", "seq_center"]) + "\n") + fout.write( + ",".join( + [ + "sample", + "single_end", + "fastq_1", + "fastq_2", + "expected_cells", + "seq_center", + "fastq_barcode", + "sample_type", + ] + ) + + "\n" + ) for sample in sorted(sample_mapping_dict.keys()): ## Check that multiple runs of the same sample are of the same datatype if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): diff --git a/bin/generate_lib_csv.py b/bin/generate_lib_csv.py new file mode 100755 index 00000000..5c1c0c4f --- /dev/null +++ b/bin/generate_lib_csv.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +import argparse +import os + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate the lib.csv for cellranger-arc.") + + parser.add_argument("-t", "--sample_types", dest="sample_types", help="Comma seperated list of sample types.") + parser.add_argument("-n", "--sample_names", dest="sample_names", help="Comma seperated list of sample names.") + parser.add_argument("-f", "--fastq_folder", dest="fastq_folder", help="Folder of FASTQ files.") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + + args = vars(parser.parse_args()) + + print(args) + + sample_types = args["sample_types"].split(",") + sample_names = args["sample_names"].split(",") + unique_samples_names = set(sample_names) + + lib_csv = open(args["out"], "w") + lib_csv.write("fastqs,sample,library_type") + + for i in range(0, len(sample_types)): + if sample_names[i] in unique_samples_names: + unique_samples_names.remove( + sample_names[i] + ) # this has to be done to account for different Lane files (e.g., L002) + if sample_types[i] == "gex": + lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Gene Expression")) + else: + lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Chromatin Accessibility")) + + lib_csv.close() + + print("Wrote lib.csv file to {}".format(args["out"])) diff --git a/conf/modules.config b/conf/modules.config index f4cea8ad..3bd0631b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -83,6 +83,32 @@ if(params.aligner == "cellranger") { } } +if(params.aligner == "cellrangerarc") { + process { + withName: CELLRANGERARC_MKGTF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkgtf", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" + } + withName: CELLRANGERARC_MKREF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkref", + mode: params.publish_dir_mode + ] + } + withName: CELLRANGERARC_COUNT { + publishDir = [ + path: "${params.outdir}/${params.aligner}/count", + mode: params.publish_dir_mode + ] + ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''} + } + } +} + if(params.aligner == "universc") { process { publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } diff --git a/docs/output.md b/docs/output.md index c1e2b013..7e9f0cd8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -17,6 +17,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [STARsolo](#starsolo) - [Salmon Alevin & AlevinQC](#salmon-alevin--alevinqc) - [Cellranger](#cellranger) + - [Cellranger ARC](#cellranger-arc) - [UniverSC](#universc) - [Other output data](#other-output-data) - [MultiQC](#multiqc) @@ -103,6 +104,14 @@ Cell Ranger is a set of analysis scripts that processes 10X Chromium single cell - Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger +## Cellranger ARC + +Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell Multiome ATAC + Gene Expression sequencing data to generate a variety of analyses pertaining to gene expression (GEX), chromatin accessibility, and their linkage. Furthermore, since the ATAC and GEX measurements are on the very same cell, we are able to perform analyses that link chromatin accessibility and GEX. See [Cellranger ARC](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc) for more information on Cellranger. + +**Output directory: `results/cellrangerarc`** + +- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger ARC + ## UniverSC UniverSC is a wrapper that calls an open-source implementation of Cell Ranger v3.0.2 and adjusts run parameters for compatibility with a wide ranger of technologies. diff --git a/docs/usage.md b/docs/usage.md index 2b0b9ce2..ee5d9812 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -111,6 +111,53 @@ See the [UniverSC GitHub page](https://github.com/minoda-lab/universc#pre-set-co Currently only 3\' scRNA-Seq parameters are supported in nextflow, although chemistry parameters for 5\' scRNA-Seq and full-length scRNA-Seq libraries are supported by teh container. +### If using cellranger-arc + +#### Automatic file name detection + +This pipeline currently **does not** automatically renames input FASTQ files to follow the +[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input): + +``` +[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz +``` + +Thus please make sure your files follow this naming convention. + +#### Sample sheet definition + +If you are using cellranger-arc you have to add the column _sample_type_ (atac for scATAC or gex for scRNA) and _fastq_barcode_ (part of the scATAC data) to your samplesheet as an input. + +**Beware of the following points:** + +- It is important that you give your scRNA and scATAC different [Sample Name]s. +- Check first which file is your barcode fastq file for your scATAC data ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/using/fastq-input)). +- If you have more than one sequencing run then you have to give them another suffix (e.g., rep\*) to your [Sample Name] ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/fastq-input#atac_quick_start)). + +An example samplesheet for a dataset called test_scARC that has two sequencing runs for the scATAC and one seqeuncing run +from two lanes for the scRNA could look like this: + +```csv +sample,fastq_1,fastq_2,fastq_barcode,sample_type +test_scARC,path/test_scARC_atac_rep1_S1_L001_R1_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_R2_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_I2_001.fastq.gz,atac +test_scARC,path/test_scARC_atac_rep2_S2_L001_R1_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_R2_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_I2_001.fastq.gz,atac +test_scARC,path/test_scARC_gex_S1_L001_R1_001.fastq.gz,path/test_scARC_gex_S1_L001_R2_001.fastq.gz,,gex +test_scARC,path/test_scARC_gex_S1_L002_R1_001.fastq.gz,path/test_scARC_gex_S1_L002_R2_001.fastq.gz,,gex +``` + +#### Config file and index + +Cellranger-arc needs a reference index directory that you can provide with `--cellranger_index`. Be aware, you can use +for cellranger-arc the same index you use for cellranger ([see](https://kb.10xgenomics.com/hc/en-us/articles/4408281606797-Are-the-references-interchangeable-between-pipelines)). +Yet, a cellranger-arc index might include additional data (e.g., TF binding motifs). Therefore, please first check if +you have to create a new cellranger-arc index ([see here](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references) for +more information) + +If you decide to create a cellranger-arc index, then you need to create a config file to generate the index. The pipeline +can do this autmatically for you if you provide a `--fasta`, `--gtf`, and an optional `--motif` file. However, you can +also decide to provide your own config file with `--cellrangerarc_config`, then you also have to specify with `--cellrangerarc_reference` +the reference genome name that you have used and stated as _genome:_ in your config file. + ## Running the pipeline The minimum typical command for running the pipeline is as follows: diff --git a/modules.json b/modules.json index a05f65d8..1119657a 100644 --- a/modules.json +++ b/modules.json @@ -64,6 +64,11 @@ "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] + }, + "unzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] } } } diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf index 7961e057..84d98608 100644 --- a/modules/local/mtx_to_h5ad.nf +++ b/modules/local/mtx_to_h5ad.nf @@ -41,11 +41,11 @@ process MTX_TO_H5AD { // // run script // - if (params.aligner == 'cellranger') + if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) """ # convert file types mtx_to_h5ad.py \\ - --aligner ${params.aligner} \\ + --aligner cellranger \\ --input filtered_feature_bc_matrix.h5 \\ --sample ${meta.id} \\ --out ${meta.id}/${meta.id}_matrix.h5ad diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf index 4351f4b3..73e260d2 100644 --- a/modules/local/mtx_to_seurat.nf +++ b/modules/local/mtx_to_seurat.nf @@ -19,10 +19,10 @@ process MTX_TO_SEURAT { script: def aligner = params.aligner - if (params.aligner == "cellranger") { - matrix = "matrix.mtx.gz" - barcodes = "barcodes.tsv.gz" - features = "features.tsv.gz" + if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { + matrix = "filtered_feature_bc_matrix/matrix.mtx.gz" + barcodes = "filtered_feature_bc_matrix/barcodes.tsv.gz" + features = "filtered_feature_bc_matrix/features.tsv.gz" } else if (params.aligner == "kallisto") { matrix = "*count/counts_unfiltered/*.mtx" barcodes = "*count/counts_unfiltered/*.barcodes.txt" diff --git a/modules/nf-core/cellranger/count/templates/cellranger_count.py b/modules/nf-core/cellranger/count/templates/cellranger_count.py deleted file mode 100644 index 4bfb9f4f..00000000 --- a/modules/nf-core/cellranger/count/templates/cellranger_count.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -""" -Automatically rename staged files for input into cellranger count. - -Copyright (c) Gregor Sturm 2023 - MIT License -""" -from subprocess import run -from pathlib import Path -from textwrap import dedent -import shlex -import re - - -def chunk_iter(seq, size): - """iterate over `seq` in chunks of `size`""" - return (seq[pos : pos + size] for pos in range(0, len(seq), size)) - - -sample_id = "${meta.id}" - -# get fastqs, ordered by path. Files are staged into -# - "fastq_001/{original_name.fastq.gz}" -# - "fastq_002/{oritinal_name.fastq.gz}" -# - ... -# Since we require fastq files in the input channel to be ordered such that a R1/R2 pair -# of files follows each other, ordering will get us a sequence of [R1, R2, R1, R2, ...] -fastqs = sorted(Path(".").glob("fastq_*/*")) -assert len(fastqs) % 2 == 0 - -# target directory in which the renamed fastqs will be placed -fastq_all = Path("./fastq_all") -fastq_all.mkdir(exist_ok=True) - -# Match R1 in the filename, but only if it is followed by a non-digit or non-character -# match "file_R1.fastq.gz", "file.R1_000.fastq.gz", etc. but -# do not match "SRR12345", "file_INFIXR12", etc -filename_pattern = r"([^a-zA-Z0-9])R1([^a-zA-Z0-9])" - -for i, (r1, r2) in enumerate(chunk_iter(fastqs, 2)): - # double escapes are required because nextflow processes this python 'template' - if re.sub(filename_pattern, r"\\1R2\\2", r1.name) != r2.name: - raise AssertionError( - dedent( - f"""\ - We expect R1 and R2 of the same sample to have the same filename except for R1/R2. - This has been checked by replacing "R1" with "R2" in the first filename and comparing it to the second filename. - If you believe this check shouldn't have failed on your filenames, please report an issue on GitHub! - - Files involved: - - {r1} - - {r2} - """ - ) - ) - r1.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R1_001.fastq.gz") - r2.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R2_001.fastq.gz") - -run( - # fmt: off - [ - "cellranger", "count", - "--id", "${prefix}", - "--fastqs", str(fastq_all), - "--transcriptome", "${reference.name}", - "--localcores", "${task.cpus}", - "--localmem", "${task.memory.toGiga()}", - *shlex.split("""${args}""") - ], - # fmt: on - check=True, -) - -# Output version information -version = run( - ["cellranger", "-V"], - text=True, - check=True, - capture_output=True, -).stdout.replace("cellranger cellranger-", "") - -# alas, no `pyyaml` pre-installed in the cellranger container -with open("versions.yml", "w") as f: - f.write('"${task.process}":\\n') - f.write(f' cellranger: "{version}"\\n') diff --git a/modules/nf-core/cellrangerarc/Dockerfile b/modules/nf-core/cellrangerarc/Dockerfile new file mode 100644 index 00000000..812b64ba --- /dev/null +++ b/modules/nf-core/cellrangerarc/Dockerfile @@ -0,0 +1,28 @@ +# Dockerfile to create container with Cell Ranger v2.0.2 +# Push to quay.io/nf-core/cellranger-arc: + +FROM continuumio/miniconda3:4.8.2 +LABEL authors="Gisela Gabernet , Florian Heyl" \ + description="Docker image containing Cell Ranger Arc" +# Disclaimer: this container is not provided nor supported by Illumina or 10x Genomics. + +# Install procps and clean apt cache +RUN apt-get update --allow-releaseinfo-change \ + && apt-get install -y \ + cpio \ + procps \ + rpm2cpio \ + unzip \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* + +# Copy pre-downloaded cellranger-arc file +ENV CELLRANGER_ARC_VER=2.0.2 +COPY cellranger-arc-$CELLRANGER_ARC_VER.tar.gz /opt/cellranger-arc-$CELLRANGER_ARC_VER.tar.gz + +# Install cellranger-arc +RUN \ + cd /opt && \ + tar -xzvf cellranger-arc-$CELLRANGER_ARC_VER.tar.gz && \ + export PATH=/opt/cellranger-arc-$CELLRANGER_ARC_VER:$PATH && \ + ln -s /opt/cellranger-arc-$CELLRANGER_ARC_VER/cellranger-arc /usr/bin/cellranger-arc && \ + rm -rf /opt/cellranger-arc-$CELLRANGER_ARC_VER.tar.gz diff --git a/modules/nf-core/cellrangerarc/README.md b/modules/nf-core/cellrangerarc/README.md new file mode 100644 index 00000000..6089d994 --- /dev/null +++ b/modules/nf-core/cellrangerarc/README.md @@ -0,0 +1,23 @@ +# Updating the docker container and making a new module release + +Cell Ranger Arc is a commercial tool from 10X Genomics. The container provided for the cellranger-arc nf-core module is not provided nor supported by 10x Genomics. Updating the Cell Ranger Arc versions in the container and pushing the update to Dockerhub needs to be done manually. + +1. Navigate to the appropriate download page. - [Cell Ranger Arc](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/installation): download the tar ball of the desired Cell Ranger Arc version with `curl` or `wget`. Place this file in the same folder where the Dockerfile lies. + +2. Edit the Dockerfile. Update the Cell Ranger Arc versions in this line: + +```bash +ENV CELLRANGER_ARC_VER= +``` + +3. Create and test the container: + +```bash +docker build . -t quay.io/nf-core/cellranger-arc: +``` + +4. Access rights are needed to push the container to the Dockerhub nfcore organization, please ask a core team member to do so. + +```bash +docker push quay.io/nf-core/cellranger-arc: +``` diff --git a/modules/nf-core/cellrangerarc/count/main.nf b/modules/nf-core/cellrangerarc/count/main.nf new file mode 100644 index 00000000..2bf0193a --- /dev/null +++ b/modules/nf-core/cellrangerarc/count/main.nf @@ -0,0 +1,84 @@ +process CELLRANGERARC_COUNT { + tag "$meta.id" + label 'process_high' + + container "nf-core/cellranger-arc:2.0.2" + + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CELLRANGERARC_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + + input: + tuple val(meta), val(sample_type), val(sub_sample), path(reads, stageAs: "fastqs/*") + path reference + + output: + tuple val(meta), path("${meta.id}/outs/*"), emit: outs + path("${meta.id}_lib.csv") , emit: lib + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def reference_name = reference.name + def sample_types = sample_type.join(",") + def sample_names = sub_sample.join(",") + def lib_csv = meta.id + "_lib.csv" + + """ + fastq_folder=\$(readlink -f fastqs) + + python3 < versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ + + stub: + """ + mkdir -p "${meta.id}/outs/" + touch ${meta.id}/outs/fake_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellrangerarc/count/meta.yml b/modules/nf-core/cellrangerarc/count/meta.yml new file mode 100644 index 00000000..919de4dc --- /dev/null +++ b/modules/nf-core/cellrangerarc/count/meta.yml @@ -0,0 +1,40 @@ +name: cellrangerarc_count +description: Module to use Cell Ranger's ARC pipelines analyze sequencing data produced from Chromium Single Cell ARC. Uses the cellranger-arc count command. +keywords: + - align + - count + - reference +tools: + - cellrangerarc: + description: Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell ARC data. + homepage: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + documentation: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + tool_dev_url: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + licence: + - 10x Genomics EULA +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - lib_csv: + type: file + description: | + Path to a 3-column CSV file declaring FASTQ paths, sample names and library types of input ATAC and GEX FASTQs. + - reference: + type: directory + description: Directory containing all the reference indices needed by Cell Ranger ARC +output: + - outs: + type: file + description: Files containing the outputs of Cell Ranger ARC + pattern: "${meta.id}/outs/*" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@Emiller88" + - "@heylf" diff --git a/modules/nf-core/cellrangerarc/mkgtf/main.nf b/modules/nf-core/cellrangerarc/mkgtf/main.nf new file mode 100644 index 00000000..f304c6bc --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkgtf/main.nf @@ -0,0 +1,36 @@ +process CELLRANGERARC_MKGTF { + tag "$gtf" + label 'process_low' + + container "nf-core/cellranger-arc:2.0.2" + + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CELLRANGERARC_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + + input: + path gtf + + output: + path "*.filtered.gtf", emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + cellranger-arc \\ + mkgtf \\ + $gtf \\ + ${gtf.baseName}.filtered.gtf \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellrangerarc/mkgtf/meta.yml b/modules/nf-core/cellrangerarc/mkgtf/meta.yml new file mode 100644 index 00000000..923c3e18 --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkgtf/meta.yml @@ -0,0 +1,32 @@ +name: cellrangerarc_mkgtf +description: Module to build a filtered gtf needed by the 10x Genomics Cell Ranger Arc tool. Uses the cellranger-arc mkgtf command. +keywords: + - reference + - mkref + - index +tools: + - cellrangerarc: + description: Cell Ranger Arc by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more. + homepage: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + documentation: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + tool_dev_url: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + licence: + - 10x Genomics EULA +input: + - gtf: + type: file + description: The reference GTF transcriptome file + pattern: "*.gtf" +output: + - gtf: + type: directory + description: The filtered GTF transcriptome file + pattern: "*.filtered.gtf" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@Emiller88" + - "@heylf" diff --git a/modules/nf-core/cellrangerarc/mkref/main.nf b/modules/nf-core/cellrangerarc/mkref/main.nf new file mode 100644 index 00000000..079776ba --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkref/main.nf @@ -0,0 +1,83 @@ +process CELLRANGERARC_MKREF { + tag "$reference_name" + label 'process_medium' + + container "nf-core/cellranger-arc:2.0.2" + + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CELLRANGERARC_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + + input: + path fasta + path gtf + path motifs + path reference_config + val reference_name + + output: + path "${reference_name}", emit: reference + path "config" , emit: config + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def fast_name = fasta.name + def gtf_name = gtf.name + def motifs_name = motifs.name + def reference_config = reference_config.name + def args = task.ext.args ?: '' + + if ( !reference_name ){ + reference_name = "cellrangerarc_reference" + } + + """ + + python3 < versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellrangerarc/mkref/meta.yml b/modules/nf-core/cellrangerarc/mkref/meta.yml new file mode 100644 index 00000000..cf98e60c --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkref/meta.yml @@ -0,0 +1,46 @@ +name: cellrangerarc_mkref +description: Module to build the reference needed by the 10x Genomics Cell Ranger Arc tool. Uses the cellranger-arc mkref command. +keywords: + - reference + - mkref + - index +tools: + - cellrangerarc: + description: Cell Ranger Arc is a set of analysis pipelines that process Chromium Single Cell Arc data. + homepage: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + documentation: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + tool_dev_url: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + licence: + - 10x Genomics EULA +input: + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fasta,fa}" + - gtf: + type: file + description: Reference transcriptome GTF file + pattern: "*.gtf" + - motifs: + type: file + description: Sequence motif file (e.g., from transcription factors) + pattern: "*.txt" + - reference_config: + type: file + description: JSON-like file holding organism, genome, reference fasta path, reference annotation gtf path, contigs that should be excluded and sequence format motif file path + pattern: config + - reference_name: + type: string + description: The name to give the new reference folder + pattern: str +output: + - reference: + type: directory + description: Folder called like the reference_name containing all the reference indices needed by Cell Ranger Arc + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@heylf" diff --git a/nextflow.config b/nextflow.config index 181aa724..628bb881 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,6 +40,11 @@ params { // Cellranger parameters cellranger_index = null + // Cellranger ARC parameters + motifs = null + cellrangerarc_config = null + cellrangerarc_reference = null + // UniverSC paramaters universc_index = null @@ -203,6 +208,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_multiome { includeConfig 'conf/test_multiome.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index d3d5fec6..34af4c64 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -58,7 +58,7 @@ "default": "alevin", "help_text": "The workflow can handle three types of methods:\n\n- Kallisto/Bustools\n- Salmon Alevin + AlevinQC\n- STARsolo\n\nTo choose which one to use, please specify either `alevin`, `star` or `kallisto` as a parameter option for `--aligner`. By default, the pipeline runs the `alevin` option. Note that specifying another aligner option also requires choosing appropriate parameters (see below) for the selected option.", "fa_icon": "fas fa-align-center", - "enum": ["kallisto", "star", "alevin", "cellranger", "universc"] + "enum": ["kallisto", "star", "alevin", "cellranger", "cellrangerarc", "universc"] }, "protocol": { "type": "string", @@ -234,6 +234,26 @@ } } }, + "cellrangerarc_options": { + "title": "Cellranger ARC Options", + "type": "object", + "description": "Params related to the Cellranger pipeline", + "default": "", + "properties": { + "motifs": { + "type": "string", + "description": "Specify a motif file to create a cellranger-arc index. Can be taken, e.g., from the JASPAR database." + }, + "cellrangerarc_config": { + "type": "string", + "description": "Specify a config file to create the cellranger-arc index." + }, + "cellrangerarc_reference": { + "type": "string", + "description": "Specify the genome reference name used in the config file to create a cellranger-arc index." + } + } + }, "universc_options": { "title": "UniverSC Options", "type": "object", @@ -466,6 +486,9 @@ { "$ref": "#/definitions/cellranger_options" }, + { + "$ref": "#/definitions/cellrangerarc_options" + }, { "$ref": "#/definitions/universc_options" }, diff --git a/subworkflows/local/align_cellrangerarc.nf b/subworkflows/local/align_cellrangerarc.nf new file mode 100644 index 00000000..3232a020 --- /dev/null +++ b/subworkflows/local/align_cellrangerarc.nf @@ -0,0 +1,56 @@ +/* + * Alignment with Cellranger Arc + */ + +include {CELLRANGERARC_MKGTF} from "../../modules/nf-core/cellrangerarc/mkgtf/main.nf" +include {CELLRANGERARC_MKREF} from "../../modules/nf-core/cellrangerarc/mkref/main.nf" +include {CELLRANGERARC_COUNT} from "../../modules/nf-core/cellrangerarc/count/main.nf" + +// Define workflow to subset and index a genome region fasta file +workflow CELLRANGERARC_ALIGN { + take: + fasta + gtf + motifs + cellranger_index + ch_fastq + cellrangerarc_config + + main: + ch_versions = Channel.empty() + + assert cellranger_index || (fasta && gtf): + "Must provide either a cellranger index or a bundle of a fasta file ('--fasta') + gtf file ('--gtf')." + + if (!cellranger_index) { + // Filter GTF based on gene biotypes passed in params.modules + CELLRANGERARC_MKGTF( gtf ) + filtered_gtf = CELLRANGERARC_MKGTF.out.gtf + ch_versions = ch_versions.mix(CELLRANGERARC_MKGTF.out.versions) + + // Make reference genome + assert ( ( !params.cellrangerarc_reference && !cellrangerarc_config ) || + ( params.cellrangerarc_reference && cellrangerarc_config ) ) : + "If you provide a config file you also have to specific the reference name and vice versa." + + cellrangerarc_reference = 'cellrangerarc_reference' + if ( params.cellrangerarc_reference ){ + cellrangerarc_reference = params.cellrangerarc_reference + } + + CELLRANGERARC_MKREF( fasta, filtered_gtf, motifs, cellrangerarc_config, cellrangerarc_reference ) + ch_versions = ch_versions.mix(CELLRANGERARC_MKREF.out.versions) + cellranger_index = CELLRANGERARC_MKREF.out.reference + } + + // Obtain read counts + CELLRANGERARC_COUNT ( + ch_fastq, + cellranger_index + ) + ch_versions = ch_versions.mix(CELLRANGERARC_COUNT.out.versions) + + emit: + ch_versions + cellranger_arc_out = CELLRANGERARC_COUNT.out.outs +} \ No newline at end of file diff --git a/subworkflows/local/fastqc.nf b/subworkflows/local/fastqc.nf index f18214a1..6825a9e0 100644 --- a/subworkflows/local/fastqc.nf +++ b/subworkflows/local/fastqc.nf @@ -8,9 +8,9 @@ workflow FASTQC_CHECK { ch_fastq main: - ch_fastq - .map { ch -> [ ch[0], ch[1] ] } - .set { ch_fastq } + + def n = (params.aligner == 'cellrangerarc') ? 3 : 1 + ch_fastq.map { ch -> [ ch[0], ch[n] ] }.set { ch_fastq } /* * FastQ QC using FASTQC diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index f5a11b18..2e06e889 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -10,21 +10,35 @@ workflow INPUT_CHECK { samplesheet // file: /path/to/samplesheet.csv main: + + reads = null + versions = null + + grouped_ch = SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) .map { create_fastq_channel(it) } - .groupTuple(by: [0]) // group replicate files together, modifies channel to [ val(meta), [ [reads_rep1], [reads_repN] ] ] - .map { meta, reads -> [ meta, reads.flatten() ] } // needs to flatten due to last "groupTuple", so we now have reads as a single array as expected by nf-core modules: [ val(meta), [ reads ] ] + // group replicate files together, modifies channel to [ val(meta), [ [reads_rep1], [reads_repN] ] ] + .groupTuple(by: [0]) + + if (params.aligner == 'cellrangerarc' ) { + grouped_ch + .map { meta, sample_type, sub_sample, reads -> [ meta, sample_type.flatten(), sub_sample.flatten(), reads.flatten() ] } + .set { reads } + } else { + grouped_ch + .map { meta, reads -> [ meta, reads.flatten() ] } .set { reads } + } emit: - reads // channel: [ val(meta), [ reads ] ] + reads // channel: [ val(meta), [*], [ reads ] ] versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] } -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] +// Function to get list of [ meta, [ multimeta ] , [ fastq_1, fastq_2 ] ] def create_fastq_channel(LinkedHashMap row) { // create meta map def meta = [:] @@ -35,16 +49,48 @@ def create_fastq_channel(LinkedHashMap row) { // add path(s) of the fastq file(s) to the meta map def fastq_meta = [] + def fastqs = [] if (!file(row.fastq_1).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" } if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] + fastqs = [ file(row.fastq_1) ] } else { if (!file(row.fastq_2).exists()) { exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastqs = [ file(row.fastq_1), file(row.fastq_2) ] + if (row.sample_type == "atac") { + if (row.fastq_barcode == "") { + exit 1, "ERROR: Please check input samplesheet -> Barcode FastQ (Dual index i5 read) file is missing!\n" + } + if (!file(row.fastq_barcode).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Barcode FastQ (Dual index i5 read) file does not exist!" + + "\n${row.fastq_barcode}" + } + fastqs.add(file(row.fastq_barcode)) + } + } + + // define meta_data for multiome + def sample_type = row.sample_type ? [row.sample_type] : ['gex'] + + def sub_sample = "" + if (params.aligner == "cellrangerarc"){ + sub_sample = row.fastq_1.split("/")[-1].replaceAll("_S[0-9]+_L[0-9]+_R1_[0-9]+.fastq.gz","") + fastqs.each{ + if(!it.name.contains(sub_sample)){ + exit 1, "ERROR: Please check input samplesheet -> Some files do not have the same sample name " + + "${sub_sample} in common!\n${it}" + } + } + } + + fastq_meta = [ meta, fastqs ] + + if (params.aligner == "cellrangerarc"){ + fastq_meta = [ meta, sample_type, sub_sample, fastqs ] } + return fastq_meta -} +} \ No newline at end of file diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf index 1a8381ce..958da400 100644 --- a/subworkflows/local/mtx_conversion.nf +++ b/subworkflows/local/mtx_conversion.nf @@ -15,7 +15,7 @@ workflow MTX_CONVERSION { ch_versions = Channel.empty() // Cellranger module output contains too many files which cause path collisions, we filter to the ones we need. - if ( params.aligner == "cellranger" ) { + if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { mtx_matrices = mtx_matrices.map { meta, mtx_files -> [ meta, mtx_files.findAll { it.toString().contains("filtered_feature_bc_matrix") } ] } diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index aeed5a0a..0f37a17f 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -43,9 +43,11 @@ include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' include { STARSOLO } from '../subworkflows/local/starsolo' include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" +include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -78,6 +80,8 @@ ch_input = file(params.input) ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : []) ch_gtf = params.gtf ? file(params.gtf) : [] ch_transcript_fasta = params.transcript_fasta ? file(params.transcript_fasta): [] +ch_motifs = params.motifs ? file(params.motifs) : [] +ch_cellrangerarc_config = params.cellrangerarc_config ? file(params.cellrangerarc_config) : [] ch_txp2gene = params.txp2gene ? file(params.txp2gene) : [] ch_multiqc_alevin = Channel.empty() ch_multiqc_star = Channel.empty() @@ -214,6 +218,20 @@ workflow SCRNASEQ { ch_mtx_matrices = ch_mtx_matrices.mix(UNIVERSC_ALIGN.out.universc_out) } + // Run cellranger pipeline + if (params.aligner == "cellrangerarc") { + CELLRANGERARC_ALIGN( + ch_genome_fasta, + ch_filter_gtf, + ch_motifs, + ch_cellranger_index, + ch_fastq, + ch_cellrangerarc_config + ) + ch_versions = ch_versions.mix(CELLRANGERARC_ALIGN.out.ch_versions) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) + } + // Run mtx to h5ad conversion subworkflow MTX_CONVERSION ( ch_mtx_matrices,