diff --git a/CHANGELOG.md b/CHANGELOG.md index 35aa33b5..d751ca81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,9 @@ * `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). +* `cellranger`: + - `cellranger/cellranger_mkref`: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files (PR #164). + * `rseqc`: - `rseqc/rseqc_inner_distance`: Calculate inner distance between read pairs (PR #159). - `rseqc/rseqc_inferexperiment`: Infer strandedness from sequencing reads (PR #158). diff --git a/src/cellranger/cellranger_mkref/config.vsh.yaml b/src/cellranger/cellranger_mkref/config.vsh.yaml new file mode 100644 index 00000000..b00c8f90 --- /dev/null +++ b/src/cellranger/cellranger_mkref/config.vsh.yaml @@ -0,0 +1,68 @@ +name: cellranger_mkref +namespace: cellranger +description: Build a Cell Ranger-compatible reference folder from user-supplied genome FASTA and gene GTF files. +keywords: [ cellranger, single-cell, rna-seq, alignment, reference, gtf, fasta ] +links: + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/references + repository: https://github.com/10XGenomics/cellranger/blob/main/lib/python/cellranger/reference_builder.py + homepage: https://www.10xgenomics.com/support/software/cell-ranger/latest + issue_tracker: https://github.com/10XGenomics/cellranger/issues +references: + doi: 10.1038/ncomms14049 +license: Proprietary +requirements: + commands: [cellranger, pigz, unpigz, tar] +authors: + - __merge__: /src/_authors/emma_rousseau.yaml + roles: [ author ] +arguments: + # inputs + - type: file + name: --genome_fasta + required: true + description: Reference genome fasta. + example: genome_sequence.fa.gz + - type: file + name: --transcriptome_gtf + required: true + description: Reference transcriptome annotation. + example: transcriptome_annotation.gtf.gz + - type: string + name: "--reference_version" + required: false + description: "Optional reference version string to include with reference" + - type: file + name: --output + direction: output + required: true + description: Output folder + example: cellranger_reference +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - path: test_data + +engines: +- type: docker + image: ghcr.io/data-intuitive/cellranger:8.0 + setup: + - type: apt + packages: + - procps + - pigz + test_setup: + - type: apt + packages: + - seqkit + - type: docker + run: | + cellranger --version | sed 's/ cellranger-/: /' > /var/software_versions.txt + +runners: +- type: executable +- type: nextflow + directives: + label: [ highmem, highcpu ] diff --git a/src/cellranger/cellranger_mkref/help.txt b/src/cellranger/cellranger_mkref/help.txt new file mode 100644 index 00000000..9cd45cb9 --- /dev/null +++ b/src/cellranger/cellranger_mkref/help.txt @@ -0,0 +1,71 @@ +``` +cellranger mkref -h +``` +Prepare a reference for use with 10x analysis software. Requires a GTF and +FASTA + +Usage: cellranger mkref [OPTIONS] --genome --fasta --genes + +Options: + --genome + Unique genome name, used to name output folder [a-zA-Z0-9_-]+. + Specify multiple genomes by specifying this argument multiple + times; the output folder will be _and_ + --fasta + Path to FASTA file containing your genome reference. Specify + multiple genomes by specifying this argument multiple times + --genes + Path to genes GTF file containing annotated genes for your genome + reference. Specify multiple genomes by specifying this argument + multiple times + --nthreads + Number of threads used during STAR genome index generation. + Defaults to 1 [default: 1] + --memgb + Maximum memory (GB) used [default: 16] + --ref-version + Optional reference version string to include with reference + --dry + Do not execute the pipeline. Generate a pipeline invocation (.mro) + file and stop + --jobmode + Job manager to use. Valid options: local (default), sge, lsf, + slurm or path to a .template file. Search for help on "Cluster + Mode" at support.10xgenomics.com for more details on configuring + the pipeline to use a compute cluster + --localcores + Set max cores the pipeline may request at one time. Only applies + to local jobs + --localmem + Set max GB the pipeline may request at one time. Only applies to + local jobs + --localvmem + Set max virtual address space in GB for the pipeline. Only applies + to local jobs + --mempercore + Reserve enough threads for each job to ensure enough memory will + be available, assuming each core on your cluster has at least this + much memory available. Only applies to cluster jobmodes + --maxjobs + Set max jobs submitted to cluster at one time. Only applies to + cluster jobmodes + --jobinterval + Set delay between submitting jobs to cluster, in ms. Only applies + to cluster jobmodes + --overrides + The path to a JSON file that specifies stage-level overrides for + cores and memory. Finer-grained than --localcores, --mempercore + and --localmem. Consult https://support.10xgenomics.com/ for an + example override file + --output-dir + Output the results to this directory + --uiport + Serve web UI at http://localhost:PORT + --disable-ui + Do not serve the web UI + --noexit + Keep web UI running after pipestance completes or fails + --nopreflight + Skip preflight checks + -h, --help + Print help diff --git a/src/cellranger/cellranger_mkref/script.sh b/src/cellranger/cellranger_mkref/script.sh new file mode 100644 index 00000000..03755998 --- /dev/null +++ b/src/cellranger/cellranger_mkref/script.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -eo pipefail + +## VIASH START +par_genome_fasta="test_data/reference_small.fa.gz" +par_transcriptome_gtf="test_data/reference_small.gtf.gz" +par_output="output.tar.gz" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +# We change into the tempdir later, so we need absolute paths. +par_genome_fasta=$(realpath $par_genome_fasta) +par_transcriptome_gtf=$(realpath $par_transcriptome_gtf) +par_output=$(realpath $par_output) + + +echo "> Unzipping input files" +unpigz -c "$par_genome_fasta" > "$tmpdir/genome.fa" + +echo "> Building star index" +cd "$tmpdir" +cellranger mkref \ + --fasta "$tmpdir/genome.fa" \ + --genes "$par_transcriptome_gtf" \ + --genome output \ + ${par_reference_version:+--ref-version $par_reference_version} \ + ${meta_cpus:+--nthreads $meta_cpus} \ + ${meta_memory_gb:+--memgb $(($meta_memory_gb-2))} # always keep 2 gb for the OS itself + +echo "> Creating archive" +tar --use-compress-program="pigz -k " -cf "$par_output" -C "$tmpdir/output" . diff --git a/src/cellranger/cellranger_mkref/test.sh b/src/cellranger/cellranger_mkref/test.sh new file mode 100644 index 00000000..5c5c1f3d --- /dev/null +++ b/src/cellranger/cellranger_mkref/test.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -eou pipefail + +## VIASH START +meta_executable="viash run src/reference/make_reference/config.vsh.yaml --" +## VIASH END + +# create temporary directory +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXXXX") +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +function seqkit_head { + input="$1" + output="$2" + if [[ ! -f "$output" ]]; then + echo "> Processing $(basename $input)" + seqkit subseq -r 1:50000 "$input" | gzip > "$output" + fi +} + +seqkit_head "$meta_resources_dir/test_data/reference_small.fa.gz" "$tmpdir/reference_small.fa.gz" +zcat "$meta_resources_dir/test_data/reference_small.gtf.gz" | awk '$4 < 50001 {print ;}' | gzip > "$tmpdir/reference_small.gtf.gz" + +echo "> Running $meta_name, writing to $tmpdir." +$meta_executable \ + --genome_fasta "$tmpdir/reference_small.fa.gz" \ + --transcriptome_gtf "$tmpdir/reference_small.gtf.gz" \ + --output "$tmpdir/myreference.tar.gz" \ + ---cpus ${meta_memory_gb:-1} \ + ---memory ${meta_memory_gb:-5}GB + +exit_code=$? +[[ $exit_code != 0 ]] && echo "Non zero exit code: $exit_code" && exit 1 + +echo ">> Checking whether output can be found" +[[ ! -f "$tmpdir/myreference.tar.gz" ]] && echo "Output tar file could not be found!" && exit 1 + +echo "> Test succeeded!" \ No newline at end of file diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz new file mode 100644 index 00000000..e24b75a9 Binary files /dev/null and b/src/cellranger/cellranger_mkref/test_data/reference_small.fa.gz differ diff --git a/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz b/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz new file mode 100644 index 00000000..1e3ce7ce Binary files /dev/null and b/src/cellranger/cellranger_mkref/test_data/reference_small.gtf.gz differ diff --git a/src/cellranger/cellranger_mkref/test_data/script.sh b/src/cellranger/cellranger_mkref/test_data/script.sh new file mode 100755 index 00000000..3b09498b --- /dev/null +++ b/src/cellranger/cellranger_mkref/test_data/script.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +TMP_DIR=tmp/cellranger_make_reference +OUT_DIR=src/cellranger/cellranger_mkref/test_data + +# check if seqkit is installed +if ! command -v seqkit &> /dev/null; then + echo "seqkit could not be found" + exit 1 +fi + +# create temporary directory and clean up on exit +mkdir -p $TMP_DIR +function clean_up { + rm -rf "$TMP_DIR" +} +trap clean_up EXIT + +# fetch reference +ORIG_FA=$TMP_DIR/reference.fa.gz +if [ ! -f $ORIG_FA ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz \ + -O $ORIG_FA +fi + +ORIG_GTF=$TMP_DIR/reference.gtf.gz +if [ ! -f $ORIG_GTF ]; then + wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz \ + -O $ORIG_GTF +fi + +# create small reference +START=30000 +END=31500 +CHR=chr1 + +touch $OUT_DIR/reference_small.fa +# subset to small region +seqkit grep -r -p "^$CHR\$" "$ORIG_FA" | \ + seqkit subseq -r "$START:$END" > $OUT_DIR/reference_small.fa + +touch $OUT_DIR/reference_small.gtf +gunzip -c "$ORIG_GTF" | awk -v FS='\t' -v OFS='\t' " + \$1 == \"$CHR\" && \$4 >= $START && \$5 <= $END { + \$4 = \$4 - $START + 1; + \$5 = \$5 - $START + 1; + print; + }" > $OUT_DIR/reference_small.gtf + +gzip $OUT_DIR/reference_small.fa +gzip $OUT_DIR/reference_small.gtf