nf-core · grst · Dec 22, 2023 · Jun 30, 2023 · Jun 30, 2023 · Jun 30, 2023
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -87,7 +87,8 @@ def check_samplesheet(file_in, file_out):
         ## Check header
         MIN_COLS = 2
         MIN_HEADER = ["sample", "fastq_1", "fastq_2"]
-        OPT_HEADER = ["expected_cells", "seq_center"]
+        OPT_HEADER = ["expected_cells", "seq_center", "fastq_barcode", "sample_type"]
+        SAMPLE_TYPES = ["gex", "atac"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
 
         unknown_header = 0
@@ -101,8 +102,7 @@ def check_samplesheet(file_in, file_out):
                 min_header_count = min_header_count + 1
             colmap[h] = i
             i = i + 1
-        if min_header_count < len(MIN_HEADER):
-            # code was checking for unknown_header or min_header_count however looking at the ifelse, unknown_header does not seem that it should be tested
+        if unknown_header or min_header_count < len(MIN_HEADER):
             given = ",".join(header)
             wanted = ",".join(MIN_HEADER)
             print(f"ERROR: Please check samplesheet header -> {given} != {wanted}")
@@ -147,7 +147,26 @@ def check_samplesheet(file_in, file_out):
                 seq_center = seq_center.replace(" ", "_")
 
             ## Check FastQ file extension
-            for fastq in [fastq_1, fastq_2]:
+            fastq_list = [fastq_1, fastq_2]
+
+            fastq_barcode = ""
+            if "fastq_barcode" in header:
+                fastq_barcode = lspl[colmap["fastq_barcode"]]
+                fastq_list.append(fastq_barcode)
+
+            sample_type = ""
+            if "sample_type" in header:
+                sample_type = lspl[colmap["sample_type"]]
+                if sample_type not in SAMPLE_TYPES:
+                    print_error(
+                        "Sample type {} is not supported! Please specify either {}".format(
+                            sample_type, " or ".join(SAMPLE_TYPES)
+                        ),
+                        "Line",
+                        line,
+                    )
+
+            for fastq in fastq_list:
                 if fastq:
                     if fastq.find(" ") != -1:
                         print_error("FastQ file contains spaces!", "Line", line)
@@ -161,9 +180,9 @@ def check_samplesheet(file_in, file_out):
             ## Auto-detect paired-end/single-end
             sample_info = []  ## [single_end, fastq_1, fastq_2]
             if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center]
+                sample_info = ["0", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type]
             elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center]
+                sample_info = ["1", fastq_1, fastq_2, expected_cells, seq_center, fastq_barcode, sample_type]
             else:
                 print_error("Invalid combination of columns provided!", "Line", line)
 
@@ -180,7 +199,21 @@ def check_samplesheet(file_in, file_out):
     ## Write validated samplesheet with appropriate columns
     if len(sample_mapping_dict) > 0:
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2", "expected_cells", "seq_center"]) + "\n")
+            fout.write(
+                ",".join(
+                    [
+                        "sample",
+                        "single_end",
+                        "fastq_1",
+                        "fastq_2",
+                        "expected_cells",
+                        "seq_center",
+                        "fastq_barcode",
+                        "sample_type",
+                    ]
+                )
+                + "\n"
+            )
             for sample in sorted(sample_mapping_dict.keys()):
                 ## Check that multiple runs of the same sample are of the same datatype
                 if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):

diff --git a/bin/generate_config.py b/bin/generate_config.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+import argparse
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate the config for cellranger-arc mkref. \
+                                     cellranger-arc mkref takes as input a configuration file that bundles various inputs to the tool. \
+                                     You can also create a config file on your own, please find more information here:\
+                                     https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references"
+    )
+
+    parser.add_argument("-f", "--fasta", dest="fasta", help="Name of the fasta file.", required=True)
+    parser.add_argument("-g", "--gtf", dest="gtf", help="Name of the gtf file.", required=True)
+    parser.add_argument("-m", "--motifs", dest="motifs", help="Name of the motifs file.")
+    parser.add_argument("-a", "--add", dest="add", help="Additional filter line.")
+
+    args = vars(parser.parse_args())
+
+    print(args)
+
+    config = open("config", "w")
+    config.write("{\n")
+    config.write('\torganism: "{}"\n'.format(args["fasta"].split(".")[0]))
+    config.write('\tgenome: ["cellrangerarc_reference"]\n')
+    config.write('\tinput_fasta: ["{}"]\n'.format(args["fasta"]))
+    config.write('\tinput_gtf: ["{}"]\n'.format(args["gtf"]))
+    if args["motifs"] != "[]":
+        config.write('\tinput_motifs: "{}"\n'.format(args["motifs"]))
+    if args["add"] != None:
+        config.write(args["add"] + "\n")
+    config.write("}")
+    config.close()
+
+    print("Wrote config file")
diff --git a/bin/generate_lib_csv.py b/bin/generate_lib_csv.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import os
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate the lib.csv for cellranger-arc.")
+
+    parser.add_argument("-t", "--sample_types", dest="sample_types", help="Comma seperated list of sample types.")
+    parser.add_argument("-n", "--sample_names", dest="sample_names", help="Comma seperated list of sample names.")
+    parser.add_argument("-f", "--fastq_folder", dest="fastq_folder", help="Folder of FASTQ files.")
+    parser.add_argument("-o", "--out", dest="out", help="Output path.")
+
+    args = vars(parser.parse_args())
+
+    print(args)
+
+    sample_types = args["sample_types"].split(",")
+    sample_names = args["sample_names"].split(",")
+    unique_samples_names = set(sample_names)
+
+    lib_csv = open(args["out"], "w")
+    lib_csv.write("fastqs,sample,library_type")
+
+    for i in range(0, len(sample_types)):
+        if sample_names[i] in unique_samples_names:
+            unique_samples_names.remove(
+                sample_names[i]
+            )  # this has to be done to account for different Lane files (e.g., L002)
+            if sample_types[i] == "gex":
+                lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Gene Expression"))
+            else:
+                lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Chromatin Accessibility"))
+
+    lib_csv.close()
+
+    print("Wrote lib.csv file to {}".format(args["out"]))
diff --git a/conf/modules.config b/conf/modules.config
@@ -83,6 +83,39 @@ if(params.aligner == "cellranger") {
     }
 }
 
+if(params.aligner == "cellrangerarc") { // per-process settings applied only when the cellrangerarc aligner is selected
+    process {
+        withName: CELLRANGERARC_MKGTF {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/mkgtf",
+                mode: params.publish_dir_mode,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // yields null for versions.yml so that file is not published
+            ]
+            ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" // gene_biotype filters; presumably forwarded to cellranger-arc mkgtf -- confirm in the module
+        }
+        withName: CELLRANGERARC_GENERATECONFIG {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/config",
+                mode: params.publish_dir_mode
+            ]
+            ext.args = "--add none" // NOTE(review): if this reaches bin/generate_config.py, the literal line "none" is written into the config -- confirm intended
+        }
+        withName: CELLRANGERARC_MKREF {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/mkref",
+                mode: params.publish_dir_mode
+            ]
+        }
+        withName: CELLRANGERARC_COUNT {
+            publishDir = [
+                path: "${params.outdir}/${params.aligner}/count",
+                mode: params.publish_dir_mode
+            ]
+            ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''} // closure: adds --expect-cells only when meta.expected_cells is truthy
+        }
+    }
+}
+
 if(params.aligner == "universc") {
     process {
         publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

diff --git a/docs/output.md b/docs/output.md
@@ -17,6 +17,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [STARsolo](#starsolo)
   - [Salmon Alevin & AlevinQC](#salmon-alevin--alevinqc)
   - [Cellranger](#cellranger)
+  - [Cellranger ARC](#cellranger-arc)
   - [UniverSC](#universc)
   - [Other output data](#other-output-data)
   - [MultiQC](#multiqc)
@@ -103,6 +104,14 @@ Cell Ranger is a set of analysis scripts that processes 10X Chromium single cell
 
 - Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger
 
+## Cellranger ARC
+
+Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell Multiome ATAC + Gene Expression sequencing data to generate a variety of analyses pertaining to gene expression (GEX), chromatin accessibility, and their linkage. Furthermore, since the ATAC and GEX measurements are on the very same cell, we are able to perform analyses that link chromatin accessibility and GEX. See [Cellranger ARC](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc) for more information on Cellranger ARC.
+
+**Output directory: `results/cellrangerarc`**
+
+- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger ARC
+
 ## UniverSC
 
 UniverSC is a wrapper that calls an open-source implementation of Cell Ranger v3.0.2 and adjusts run parameters for compatibility with a wide ranger of technologies.

diff --git a/docs/usage.md b/docs/usage.md
@@ -97,6 +97,51 @@ UniverSC automatically updates the barcode whitelist and chemistry parameters. U
 
 Currently only 3\' scRNA-Seq parameters are supported in nextflow, although chemistry parameters for 5\' scRNA-Seq and full-length scRNA-Seq libraries are supported by teh container.
 
+### If using cellranger-arc
+
+#### Automatic file name detection
+
+This pipeline currently **does not** automatically rename input FASTQ files to follow the
+[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input):
+
+```
+[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz
+```
+
+Thus please make sure your files follow this naming convention.
+
+#### Sample sheet definition
+
+If you are using cellranger-arc you have to add the column _sample_type_ (atac for scATAC or gex for scRNA) and _fastq_barcode_ (part of the scATAC data) to your samplesheet as an input.
+
+**Beware of the following points:**
+
+- It is important that you give your scRNA and scATAC different [Sample Name]s.
+- Check first which file is your barcode fastq file for your scATAC data ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/fastq-input)).
+- If you have more than one sequencing run then you have to give them another suffix (e.g., rep\*) to your [Sample Name] ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/fastq-input#atac_quick_start)).
+
+An example samplesheet for a dataset called test_scARC that has two sequencing runs for the scATAC and one sequencing run
+from two lanes for the scRNA could look like this:
+```csv
+sample,fastq_1,fastq_2,fastq_barcode,sample_type
+test_scARC,path/test_scARC_atac_rep1_S1_L001_R1_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_R2_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_I2_001.fastq.gz,atac
+test_scARC,path/test_scARC_atac_rep2_S2_L001_R1_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_R2_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_I2_001.fastq.gz,atac
+test_scARC,path/test_scARC_gex_S1_L001_R1_001.fastq.gz,path/test_scARC_gex_S1_L001_R2_001.fastq.gz,,gex
+test_scARC,path/test_scARC_gex_S1_L002_R1_001.fastq.gz,path/test_scARC_gex_S1_L002_R2_001.fastq.gz,,gex
+```
+#### Config file and index
+
+Cellranger-arc needs a reference index directory that you can provide with `--cellranger_index`. Note that you can use
+the same index for cellranger-arc that you use for cellranger ([see](https://kb.10xgenomics.com/hc/en-us/articles/4408281606797-Are-the-references-interchangeable-between-pipelines)).
+However, a cellranger-arc index might include additional data (e.g., TF binding motifs). Therefore, please first check
+whether you have to create a new cellranger-arc index ([see here](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references) for
+more information).
+
+If you decide to create a cellranger-arc index, then you need to create a config file to generate the index. The pipeline
+can do this automatically for you if you provide a `--fasta`, `--gtf`, and an optional `--motif` file. However, you can
+also decide to provide your own config file with `--cellrangerarc_config`, then you also have to specify with `--cellrangerarc_reference`
+the reference genome name that you have used and stated as _genome:_ in your config file.
+
 ## Running the pipeline
 
 The minimum typical command for running the pipeline is as follows:

diff --git a/modules.json b/modules.json
@@ -64,6 +64,11 @@
                         "branch": "master",
                         "git_sha": "cf67a6d7d043e2bd6a3099be84c72046fc71508f",
                         "installed_by": ["modules"]
+                    },
+                    "unzip": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
                     }
                 }
             }

diff --git a/modules/local/cellrangerarc/Dockerfile b/modules/local/cellrangerarc/Dockerfile
@@ -0,0 +1,28 @@
+# Dockerfile to create container with Cell Ranger ARC v2.0.2 (version set by CELLRANGER_ARC_VER below)
+# Push to quay.io/nf-core/cellranger-arc:<VER>
+
+FROM continuumio/miniconda3:4.8.2
+LABEL authors="Gisela Gabernet <[email protected]>, Florian Heyl" \
+    description="Docker image containing Cell Ranger Arc"
+# Disclaimer: this container is not provided nor supported by Illumina or 10x Genomics.
+
+# Install system utilities (cpio, procps, rpm2cpio, unzip) and clean the apt cache
+RUN apt-get update --allow-releaseinfo-change \
+    && apt-get install -y \
+    cpio \
+    procps \
+    rpm2cpio \
+    unzip \
+    && apt-get clean -y && rm -rf /var/lib/apt/lists/*
+
+# Copy the pre-downloaded cellranger-arc tarball; it must be present in the build context next to this Dockerfile
+ENV CELLRANGER_ARC_VER=2.0.2
+COPY cellranger-arc-$CELLRANGER_ARC_VER.tar.gz /opt/cellranger-arc-$CELLRANGER_ARC_VER.tar.gz
+
+# Install cellranger-arc: unpack under /opt, symlink the launcher into /usr/bin, then delete the tarball
+RUN \
+    cd /opt && \
+    tar -xzvf cellranger-arc-$CELLRANGER_ARC_VER.tar.gz && \
+    export PATH=/opt/cellranger-arc-$CELLRANGER_ARC_VER:$PATH && \
+    ln -s /opt/cellranger-arc-$CELLRANGER_ARC_VER/cellranger-arc /usr/bin/cellranger-arc && \
+    rm -rf /opt/cellranger-arc-$CELLRANGER_ARC_VER.tar.gz
diff --git a/modules/local/cellrangerarc/README.md b/modules/local/cellrangerarc/README.md
@@ -0,0 +1,23 @@
+# Updating the docker container and making a new module release
+
+Cell Ranger Arc is a commercial tool from 10X Genomics. The container provided for the cellranger-arc nf-core module is not provided nor supported by 10x Genomics. Updating the Cell Ranger Arc versions in the container and pushing the update to quay.io needs to be done manually.
+
+1. Navigate to the appropriate download page. - [Cell Ranger Arc](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/installation): download the tar ball of the desired Cell Ranger Arc version with `curl` or `wget`. Place this file in the same folder where the Dockerfile lies.
+
+2. Edit the Dockerfile. Update the Cell Ranger Arc versions in this line:
+
+```bash
+ENV CELLRANGER_ARC_VER=<VERSION>
+```
+
+3. Create and test the container:
+
+```bash
+docker build . -t quay.io/nf-core/cellranger-arc:<VERSION>
+```
+
+4. Access rights are needed to push the container to the quay.io nf-core organization, please ask a core team member to do so.
+
+```bash
+docker push quay.io/nf-core/cellranger-arc:<VERSION>
+```