Add cellranger_mkref (#164)

* initial commit dedup * Revert "initial commit dedup" This reverts commit 38f586b. * mkref component - config, test data, scripts, changelog * Small updates * Update CHANGELOG * Update src/cellranger/cellranger_mkref/script.sh --------- Co-authored-by: DriesSchaumont <[email protected]> Co-authored-by: Robrecht Cannoodt <[email protected]>
viash-hub · Nov 8, 2024 · 1d17ce0 · 1d17ce0
1 parent b3fcd52
commit 1d17ce0
Show file tree

Hide file tree

Showing 8 changed files with 273 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,9 @@
 
 * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93).
 
+* `cellranger`:
+  - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164).
+
 * `rseqc`:
   - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159).
   - `rseqc/rseqc_inferexperiment`: Infer strandedness from sequencing reads (PR #158).

diff --git a/src/cellranger/cellranger_mkref/config.vsh.yaml b/src/cellranger/cellranger_mkref/config.vsh.yaml
@@ -0,0 +1,68 @@
+name: cellranger_mkref
+namespace: cellranger
+description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files.
+keywords: [ cellranger, single-cell, rna-seq, alignment, reference, gtf, fasta ]
+links:
+  documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references
+  repository: https://github.com/10XGenomics/cellranger/blob/main/lib/python/cellranger/reference_builder.py
+  homepage: https://www.10xgenomics.com/support/software/cell-ranger/latest
+  issue_tracker: https://github.com/10XGenomics/cellranger/issues
+references:
+  doi: 10.1038/ncomms14049
+license: Proprietary
+requirements:
+  commands: [cellranger, pigz, unpigz, tar]
+authors:
+  - __merge__: /src/_authors/emma_rousseau.yaml
+    roles: [ author ]
+arguments:
+  # inputs
+  - type: file
+    name: --genome_fasta
+    required: true
+    description: Reference genome fasta.
+    example: genome_sequence.fa.gz
+  - type: file
+    name: --transcriptome_gtf
+    required: true
+    description: Reference transcriptome annotation.
+    example: transcriptome_annotation.gtf.gz
+  - type: string
+    name: "--reference_version"
+    required: false
+    description: "Optional reference version string to include with reference"
+  - type: file
+    name: --output
+    direction: output
+    required: true
+    description: Output folder
+    example: cellranger_reference
+resources:
+  - type: bash_script
+    path: script.sh
+test_resources:
+  - type: bash_script
+    path: test.sh
+  - path: test_data
+
+engines:
+- type: docker
+  image: ghcr.io/data-intuitive/cellranger:8.0
+  setup:
+    - type: apt
+      packages:
+        - procps
+        - pigz
+  test_setup:
+    - type: apt
+      packages:
+        - seqkit
+    - type: docker
+      run: |
+        cellranger --version | sed 's/ cellranger-/: /' > /var/software_versions.txt
+
+runners:
+- type: executable
+- type: nextflow
+  directives:
+    label: [ highmem, highcpu ]
diff --git a/src/cellranger/cellranger_mkref/help.txt b/src/cellranger/cellranger_mkref/help.txt
@@ -0,0 +1,71 @@
+```
+cellranger mkref -h
+```
+Prepare a reference for use with 10x analysis software. Requires a GTF and
+FASTA
+
+Usage: cellranger mkref [OPTIONS] --genome <GENOME_NAMES> --fasta <FASTA_FILES> --genes <GTF_FILES>
+
+Options:
+      --genome <GENOME_NAMES>
+          Unique genome name, used to name output folder [a-zA-Z0-9_-]+.
+          Specify multiple genomes by specifying this argument multiple
+          times; the output folder will be <name1>_and_<name2>
+      --fasta <FASTA_FILES>
+          Path to FASTA file containing your genome reference. Specify
+          multiple genomes by specifying this argument multiple times
+      --genes <GTF_FILES>
+          Path to genes GTF file containing annotated genes for your genome
+          reference. Specify multiple genomes by specifying this argument
+          multiple times
+      --nthreads <NUM_THREADS>
+          Number of threads used during STAR genome index generation.
+          Defaults to 1 [default: 1]
+      --memgb <MEM_GB>
+          Maximum memory (GB) used [default: 16]
+      --ref-version <REF_VERSION>
+          Optional reference version string to include with reference
+      --dry
+          Do not execute the pipeline. Generate a pipeline invocation (.mro)
+          file and stop
+      --jobmode <MODE>
+          Job manager to use. Valid options: local (default), sge, lsf,
+          slurm or path to a .template file. Search for help on "Cluster
+          Mode" at support.10xgenomics.com for more details on configuring
+          the pipeline to use a compute cluster
+      --localcores <NUM>
+          Set max cores the pipeline may request at one time. Only applies
+          to local jobs
+      --localmem <NUM>
+          Set max GB the pipeline may request at one time. Only applies to
+          local jobs
+      --localvmem <NUM>
+          Set max virtual address space in GB for the pipeline. Only applies
+          to local jobs
+      --mempercore <NUM>
+          Reserve enough threads for each job to ensure enough memory will
+          be available, assuming each core on your cluster has at least this
+          much memory available. Only applies to cluster jobmodes
+      --maxjobs <NUM>
+          Set max jobs submitted to cluster at one time. Only applies to
+          cluster jobmodes
+      --jobinterval <NUM>
+          Set delay between submitting jobs to cluster, in ms. Only applies
+          to cluster jobmodes
+      --overrides <PATH>
+          The path to a JSON file that specifies stage-level overrides for
+          cores and memory. Finer-grained than --localcores, --mempercore
+          and --localmem. Consult https://support.10xgenomics.com/ for an
+          example override file
+      --output-dir <PATH>
+          Output the results to this directory
+      --uiport <PORT>
+          Serve web UI at http://localhost:PORT
+      --disable-ui
+          Do not serve the web UI
+      --noexit
+          Keep web UI running after pipestance completes or fails
+      --nopreflight
+          Skip preflight checks
+  -h, --help
+          Print help
diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -eo pipefail
+
+## VIASH START
+par_genome_fasta="test_data/reference_small.fa.gz"
+par_transcriptome_gtf="test_data/reference_small.gtf.gz"
+par_output="output.tar.gz"
+## VIASH END
+
+# create temporary directory
+tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX")
+function clean_up {
+    rm -rf "$tmpdir"
+}
+trap clean_up EXIT
+
+# We change into the tempdir later, so we need absolute paths.
+par_genome_fasta=$(realpath $par_genome_fasta)
+par_transcriptome_gtf=$(realpath $par_transcriptome_gtf)
+par_output=$(realpath $par_output)
+
+
+echo "> Unzipping input files"
+unpigz -c "$par_genome_fasta" > "$tmpdir/genome.fa"
+
+echo "> Building star index"
+cd "$tmpdir"
+cellranger mkref \
+  --fasta "$tmpdir/genome.fa" \
+  --genes "$par_transcriptome_gtf" \
+  --genome output \
+  ${par_reference_version:+--ref-version $par_reference_version} \
+  ${meta_cpus:+--nthreads $meta_cpus} \
+  ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itself
+
+echo "> Creating archive"
+tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" .
diff --git a/src/cellranger/cellranger_mkref/test.sh b/src/cellranger/cellranger_mkref/test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -eou pipefail
+
+## VIASH START
+meta_executable="viash run src/reference/make_reference/config.vsh.yaml --"
+## VIASH END
+
+# create temporary directory
+tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX")
+function clean_up {
+    rm -rf "$tmpdir"
+}
+trap clean_up EXIT
+
+function seqkit_head {
+  input="$1"
+  output="$2"
+  if [[ ! -f "$output" ]]; then
+    echo "> Processing $(basename $input)"
+    seqkit subseq -r 1:50000 "$input" | gzip > "$output"
+  fi
+}
+
+seqkit_head "$meta_resources_dir/test_data/reference_small.fa.gz" "$tmpdir/reference_small.fa.gz"
+zcat "$meta_resources_dir/test_data/reference_small.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz"
+
+echo "> Running $meta_name, writing to $tmpdir."
+$meta_executable \
+  --genome_fasta "$tmpdir/reference_small.fa.gz" \
+  --transcriptome_gtf "$tmpdir/reference_small.gtf.gz" \
+  --output "$tmpdir/myreference.tar.gz" \
+  ---cpus ${meta_memory_gb:-1} \
+  ---memory ${meta_memory_gb:-5}GB
+
+exit_code=$?
+[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1
+
+echo ">> Checking whether output can be found"
+[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1
+
+echo "> Test succeeded!"
diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz
diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz
diff --git a/src/cellranger/cellranger_mkref/test_data/script.sh b/src/cellranger/cellranger_mkref/test_data/script.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+TMP_DIR=tmp/cellranger_make_reference
+OUT_DIR=src/cellranger/cellranger_mkref/test_data
+
+# check if seqkit is installed
+if ! command -v seqkit &> /dev/null; then
+  echo "seqkit could not be found"
+  exit 1
+fi
+
+# create temporary directory and clean up on exit
+mkdir -p $TMP_DIR
+function clean_up {
+    rm -rf "$TMP_DIR"
+}
+trap clean_up EXIT
+
+# fetch reference
+ORIG_FA=$TMP_DIR/reference.fa.gz
+if [ ! -f $ORIG_FA ]; then
+  wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \
+    -O $ORIG_FA
+fi
+
+ORIG_GTF=$TMP_DIR/reference.gtf.gz
+if [ ! -f $ORIG_GTF ]; then
+  wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \
+    -O $ORIG_GTF
+fi
+
+# create small reference
+START=30000
+END=31500
+CHR=chr1
+
+touch $OUT_DIR/reference_small.fa
+# subset to small region
+seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \
+  seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa
+
+touch $OUT_DIR/reference_small.gtf
+gunzip -c "$ORIG_GTF" | awk -v FS='\t' -v OFS='\t' "
+    \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END {
+      \$4 = \$4 - $START + 1;
+      \$5 = \$5 - $START + 1;
+      print;
+    }" > $OUT_DIR/reference_small.gtf
+
+gzip $OUT_DIR/reference_small.fa
+gzip $OUT_DIR/reference_small.gtf