Skip to content

Commit

Permalink
three rsem components initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
emmarousseau committed Jul 18, 2024
1 parent 1679c59 commit dc275da
Show file tree
Hide file tree
Showing 21 changed files with 4,634 additions and 0 deletions.
109 changes: 109 additions & 0 deletions src/rsem/rsem_calculate_expression/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Viash component wrapping `rsem-calculate-expression` for a single sample.
name: "rsem_calculate_expression"
namespace: "rsem"
description: |
  Calculate expression with RSEM.
keywords: ["Transcriptome", "Index"]
links:
  homepage: https://deweylab.github.io/RSEM/
  documentation: https://deweylab.github.io/RSEM/rsem-calculate-expression.html
  repository: https://github.com/deweylab/RSEM
references:
  # Bare DOI identifier rather than the https://doi.org/ resolver URL.
  doi: "10.1186/1471-2105-12-323"
license: GPL-3.0

argument_groups:
  - name: "Input"
    arguments:
      - name: "--id"
        type: string
        description: Sample ID.
      - name: "--strandedness"
        type: string
        description: Sample strand-specificity. Must be one of unstranded, forward, reverse.
        choices: [forward, reverse, unstranded]
      - name: "--paired"
        type: boolean
        description: Paired-end reads or not?
      - name: "--input"
        type: file
        description: Input reads for quantification.
        multiple: true
        # script.sh splits the joined value on commas, so the separator is
        # pinned here instead of relying on the Viash default.
        multiple_sep: ","
      - name: "--index"
        type: file
        description: RSEM index.
      - name: "--extra_args"
        type: string
        description: Extra rsem-calculate-expression arguments in addition to the examples.
      # script.sh appends its version record to this file when it exists.
      - name: "--versions"
        type: file
        description: Existing versions file to extend with the RSEM version (optional).
        must_exist: false

  - name: "Output"
    arguments:
      - name: "--counts_gene"
        type: file
        description: Expression counts on gene level
        example: $id.genes.results
        direction: output
      - name: "--counts_transcripts"
        type: file
        description: Expression counts on transcript level
        example: $id.isoforms.results
        direction: output
      - name: "--stat"
        type: file
        description: RSEM statistics
        example: $id.stat
        direction: output
      - name: "--logs"
        type: file
        description: RSEM logs
        example: $id.log
        direction: output
      - name: "--bam_star"
        type: file
        description: BAM file generated by STAR (optional)
        example: $id.STAR.genome.bam
        direction: output
      - name: "--bam_genome"
        type: file
        description: Genome BAM file (optional)
        example: $id.genome.bam
        direction: output
      - name: "--bam_transcript"
        type: file
        description: Transcript BAM file (optional)
        example: $id.transcript.bam
        direction: output
      # script.sh writes its version record to this path.
      - name: "--updated_versions"
        type: file
        description: Versions file including the RSEM version used by this component.
        default: versions.yml
        direction: output

resources:
  - type: bash_script
    path: script.sh

test_resources:
  - type: bash_script
    path: test.sh
  - path: /testData/minimal_test/input_fastq/SRR6357070_1.fastq.gz
  - path: /testData/minimal_test/input_fastq/SRR6357070_2.fastq.gz
  - path: /testData/minimal_test/reference/rsem.tar.gz

# TODO: Install bowtie/bowtie2
engines:
  - type: docker
    image: ubuntu:22.04
    setup:
      - type: docker
        # ca-certificates is installed so wget can verify the GitHub TLS
        # certificate; downloads no longer use --no-check-certificate.
        run: |
          apt-get update && \
          apt-get install -y --no-install-recommends build-essential ca-certificates gcc g++ make wget zlib1g-dev unzip && \
          apt-get clean && \
          rm -rf /var/lib/apt/lists/* && \
          wget https://github.com/alexdobin/STAR/archive/refs/tags/2.7.11a.zip && \
          unzip 2.7.11a.zip && \
          cp STAR-2.7.11a/bin/Linux_x86_64_static/STAR /usr/local/bin && \
          cd && \
          wget https://github.com/deweylab/RSEM/archive/refs/tags/v1.3.3.zip && \
          unzip v1.3.3.zip && \
          cd RSEM-1.3.3 && \
          make && \
          make install
runners:
  - type: executable
  - type: nextflow
Empty file.
41 changes: 41 additions & 0 deletions src/rsem/rsem_calculate_expression/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

# Quantify expression for one sample by wrapping rsem-calculate-expression.
#
# Inputs are read from the par_* / meta_* environment variables:
#   par_id              sample name handed to RSEM (prefix of its output files)
#   par_strandedness    forward | reverse | anything else (no flag is passed)
#   par_paired          "true" adds --paired-end
#   par_input           read files, joined by "," or ";"
#   par_index           directory searched for the RSEM reference (*.grp)
#   par_extra_args      extra command-line flags, word-split on purpose
#   par_counts_gene, par_counts_transcripts, par_stat, par_logs,
#   par_bam_star, par_bam_genome, par_bam_transcript
#                       requested locations of the RSEM outputs
#   par_versions, par_updated_versions
#                       optional versions bookkeeping files

set -eo pipefail

function clean_up {
  # tmpdir is empty when mktemp itself failed; nothing to remove then.
  if [ -n "${tmpdir:-}" ]; then
    rm -rf "$tmpdir"
  fi
}
trap clean_up EXIT

tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX")

# Move an output produced by RSEM to the location requested by the caller.
#   $1  path RSEM wrote (relative to the working directory)
#   $2  requested destination; may be empty when the output was not asked for
# Missing outputs are skipped, as are outputs that already sit at the
# requested location.
function publish_output {
  local produced="$1" requested="$2"
  if [ -z "$requested" ] || [ ! -e "$produced" ]; then
    return 0
  fi
  if [ "$produced" -ef "$requested" ]; then
    return 0
  fi
  mkdir -p "$(dirname "$requested")"
  mv "$produced" "$requested"
}

# Quoting keeps the test valid when --strandedness was not provided.
case "$par_strandedness" in
  forward | reverse)
    strandedness_args=(--strandedness "$par_strandedness")
    ;;
  *)
    strandedness_args=()
    ;;
esac

# par_paired is the string "true" or "false"; only "true" may enable the flag
# (a ${var:+...} expansion would also fire for "false").
paired_args=()
if [ "$par_paired" == "true" ]; then
  paired_args=(--paired-end)
fi

# Accept both "," and ";" as the separator between read files.
IFS=",;" read -ra input <<< "$par_input"

# Locate the reference prefix inside the index passed by the caller; fall back
# to the resources directory (the previous behaviour) when --index is absent.
index_root="${par_index:-$meta_resources_dir/}"
INDEX=$(find -L "$index_root" -name "*.grp" -print -quit | sed 's/\.grp$//')
if [ -z "$INDEX" ]; then
  echo "No RSEM reference (*.grp) found in '$index_root'" >&2
  exit 1
fi

# $par_extra_args is intentionally unquoted so it splits into separate flags.
rsem-calculate-expression \
  ${meta_cpus:+--num-threads $meta_cpus} \
  "${strandedness_args[@]}" \
  "${paired_args[@]}" \
  $par_extra_args \
  "${input[@]}" \
  "$INDEX" \
  "$par_id"

# RSEM names its outputs after the sample name; hand them over to the
# requested output paths.
publish_output "$par_id.genes.results" "$par_counts_gene"
publish_output "$par_id.isoforms.results" "$par_counts_transcripts"
publish_output "$par_id.stat" "$par_stat"
publish_output "$par_id.log" "$par_logs"
publish_output "$par_id.STAR.genome.bam" "$par_bam_star"
publish_output "$par_id.genome.bam" "$par_bam_genome"
publish_output "$par_id.transcript.bam" "$par_bam_transcript"

# Version
text="${meta_functionality_name}:
  rsem: $(rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g')"
# Only record the version when a destination was given; redirecting to an
# empty path would abort the script after a successful run.
if [ -n "$par_updated_versions" ]; then
  if [ -e "$par_versions" ]; then
    # Copy instead of appending in place so the input file stays untouched.
    cp "$par_versions" "$par_updated_versions"
    echo "$text" >> "$par_updated_versions"
  else
    echo "$text" > "$par_updated_versions"
  fi
fi
34 changes: 34 additions & 0 deletions src/rsem/rsem_calculate_expression/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# Component test: runs the executable on the minimal paired-end test sample
# and checks that the main RSEM outputs were produced.

# Stop at the first failing command (including a failing component run).
set -eo pipefail

# Fail the test unless the path is an existing, non-empty regular file.
#   $1  path to check
#   $2  label used in the failure message
function assert_file_not_empty {
  local path="$1" label="$2"
  if [ ! -f "$path" ]; then
    echo "$label file does not exist!"
    exit 1
  fi
  if [ ! -s "$path" ]; then
    echo "$label file is empty!"
    exit 1
  fi
}

echo ">>> Testing $meta_functionality_name"

# Unpack the prebuilt reference into the working directory (creates ./rsem).
tar -xavf "$meta_resources_dir/rsem.tar.gz"

echo ">>> Calculating expression"
"$meta_executable" \
  --id WT_REP1 \
  --strandedness reverse \
  --paired true \
  --input "$meta_resources_dir/SRR6357070_1.fastq.gz,$meta_resources_dir/SRR6357070_2.fastq.gz" \
  --index rsem \
  --extra_args "--star --star-output-genome-bam --star-gzipped-read-file --estimate-rspd --seed 1" \
  --counts_gene WT_REP1.genes.results \
  --counts_transcripts WT_REP1.isoforms.results \
  --stat WT_REP1.stat \
  --logs WT_REP1.log \
  --bam_star WT_REP1.STAR.genome.bam \
  --bam_genome WT_REP1.genome.bam \
  --bam_transcript WT_REP1.transcript.bam

echo ">>> Checking whether output exists"
assert_file_not_empty "WT_REP1.genes.results" "Gene level expression counts"
assert_file_not_empty "WT_REP1.isoforms.results" "Transcript level expression counts"

# RSEM writes <sample>.stat as a directory, so it is checked as one.
if [ ! -d "WT_REP1.stat" ]; then
  echo "Stats directory does not exist!"
  exit 1
fi
if [ -z "$(ls -A "WT_REP1.stat")" ]; then
  echo "Stats directory is empty!"
  exit 1
fi

assert_file_not_empty "WT_REP1.log" "Log"

echo "All tests succeeded!"
exit 0
Binary file not shown.
Binary file not shown.
Binary file not shown.
5 changes: 5 additions & 0 deletions src/rsem/rsem_calculate_expression/test_data/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Download both mates of the SRR6357070 test sample from the nf-core
# test-datasets repository into the current directory.
dataset_url="https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3/testdata/GSE110004"

for mate in 1 2; do
  wget "${dataset_url}/SRR6357070_${mate}.fastq.gz"
done

68 changes: 68 additions & 0 deletions src/rsem/rsem_merge_counts/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Viash component that merges per-sample RSEM results into combined tables.
name: "rsem_merge_counts"
namespace: "rsem"
info:
  migration_info:
    git_repo: https://github.com/nf-core/rnaseq.git
    paths: [modules/local/rsem_merge_counts/main.nf]
    last_sha: "311279532694ce7520164ce4d65a388c0cd11f60"

description: |
  Merge the transcript quantification results obtained from rsem calculate-expression across all samples.
argument_groups:
  - name: "Input"
    arguments:
      # One results file per sample; the component merges across samples, so
      # several values must be accepted.
      - name: "--counts_gene"
        type: file
        description: Expression counts on gene level (genes)
        multiple: true
        multiple_sep: ";"
      - name: "--counts_transcripts"
        type: file
        description: Expression counts on transcript level (isoforms)
        multiple: true
        multiple_sep: ";"
      - name: "--versions"
        type: file
        description: Existing versions file to extend (optional).
        must_exist: false

  - name: "Output"
    arguments:
      - name: "--merged_gene_counts"
        type: file
        description: File containing gene counts across all samples.
        default: rsem.merged.gene_counts.tsv
        direction: output
      - name: "--merged_gene_tpm"
        type: file
        description: File containing gene TPM across all samples.
        default: rsem.merged.gene_tpm.tsv
        direction: output
      - name: "--merged_transcript_counts"
        type: file
        description: File containing transcript counts across all samples.
        default: rsem.merged.transcript_counts.tsv
        direction: output
      - name: "--merged_transcript_tpm"
        type: file
        description: File containing transcript TPM across all samples.
        default: rsem.merged.transcript_tpm.tsv
        direction: output
      - name: "--updated_versions"
        type: file
        description: Versions file including the record written by this component.
        default: versions.yml
        direction: output

resources:
  - type: bash_script
    path: script.sh

test_resources:
  - type: bash_script
    path: test.sh
  # - path: /testData/minimal_test/input_fastq/SRR6357070_1.fastq.gz
  # - path: /testData/minimal_test/input_fastq/SRR6357070_2.fastq.gz

engines:
  - type: docker
    image: ubuntu:22.04

runners:
  - type: executable
  - type: nextflow
3 changes: 3 additions & 0 deletions src/rsem/rsem_merge_counts/help.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```bash
rsem-merge-counts --help
```
38 changes: 38 additions & 0 deletions src/rsem/rsem_merge_counts/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

# Merge per-sample RSEM result tables into four combined tables.
#
# Inputs (environment variables):
#   par_counts_gene          *.genes.results files: a single file, a directory
#                            of files, or several entries joined by ";"
#   par_counts_transcripts   *.isoforms.results files, same conventions
#   par_versions             optional existing versions file
# Outputs:
#   par_merged_gene_counts, par_merged_gene_tpm,
#   par_merged_transcript_counts, par_merged_transcript_tpm,
#   par_updated_versions
#
# Each merged table starts with the first two columns of the first input file
# followed by one column per sample: column 5 of every input goes to the
# "counts" tables and column 6 to the "tpm" tables.

# -o pipefail (not -p, which switches bash to privileged mode).
set -eo pipefail

# Scratch space for the per-sample column files; removed on exit.
workdir=$(mktemp -d "${meta_temp_dir:-/tmp}/${meta_functionality_name:-rsem_merge_counts}-XXXXXXXX")
function clean_up {
  rm -rf "$workdir"
}
trap clean_up EXIT

# Print one input file per line for a ";"-separated list of files and/or
# directories. Directories are expanded to the regular files they contain.
#   $1  value of the par_* input variable
function list_result_files {
  local spec="$1" entry
  local -a entries
  IFS=";" read -ra entries <<< "$spec"
  for entry in "${entries[@]}"; do
    if [ -d "$entry" ]; then
      find -L "$entry" -maxdepth 1 -type f | sort
    elif [ -n "$entry" ]; then
      echo "$entry"
    fi
  done
}

# Write the id columns and the per-sample counts/tpm column files for one level.
#   $1     level name: "genes" or "isoforms" (also the file-name suffix that is
#          stripped to obtain the sample name)
#   $2...  result files of that level
function split_columns {
  local level="$1"
  shift
  local result_file samplename

  if [ "$#" -eq 0 ]; then
    echo "No ${level} result files were provided" >&2
    exit 1
  fi

  mkdir -p "$workdir/$level"
  # The id columns are taken from the first file only.
  cut -f 1,2 "$1" > "$workdir/${level}_ids.txt"

  for result_file in "$@"; do
    samplename=$(basename "$result_file" | sed "s/\\.${level}\\.results\$//")
    # The sample name replaces the original header line of each column.
    echo "$samplename" > "$workdir/$level/${samplename}.counts.txt"
    cut -f 5 "$result_file" | tail -n +2 >> "$workdir/$level/${samplename}.counts.txt"
    echo "$samplename" > "$workdir/$level/${samplename}.tpm.txt"
    cut -f 6 "$result_file" | tail -n +2 >> "$workdir/$level/${samplename}.tpm.txt"
  done
}

mapfile -t gene_files < <(list_result_files "$par_counts_gene")
mapfile -t transcript_files < <(list_result_files "$par_counts_transcripts")

split_columns genes "${gene_files[@]}"
split_columns isoforms "${transcript_files[@]}"

paste "$workdir/genes_ids.txt" "$workdir"/genes/*.counts.txt > "$par_merged_gene_counts"
paste "$workdir/genes_ids.txt" "$workdir"/genes/*.tpm.txt > "$par_merged_gene_tpm"
paste "$workdir/isoforms_ids.txt" "$workdir"/isoforms/*.counts.txt > "$par_merged_transcript_counts"
paste "$workdir/isoforms_ids.txt" "$workdir"/isoforms/*.tpm.txt > "$par_merged_transcript_tpm"

# Version
text="${meta_functionality_name}:
  sed: $(echo $(sed --version 2>&1) | grep -oP 'sed \(GNU sed\) \K\d+\.\d+')"
# Skip the record when no destination was given instead of failing on an
# empty redirect target.
if [ -n "$par_updated_versions" ]; then
  if [ -e "$par_versions" ]; then
    # Copy instead of appending in place so the input file stays untouched.
    cp "$par_versions" "$par_updated_versions"
    echo "$text" >> "$par_updated_versions"
  else
    echo "$text" > "$par_updated_versions"
  fi
fi
Binary file not shown.
Binary file not shown.
6 changes: 6 additions & 0 deletions src/rsem/rsem_merge_counts/test_data/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash

# Download the test FASTQ pair and the prebuilt RSEM reference archive from
# the nf-core test-datasets repository into the current directory.
test_data_urls=(
  "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3/testdata/GSE110004/SRR6357070_1.fastq.gz"
  "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3/testdata/GSE110004/SRR6357070_2.fastq.gz"
  "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/rsem.tar.gz"
)

for url in "${test_data_urls[@]}"; do
  wget "$url"
done

72 changes: 72 additions & 0 deletions src/rsem/rsem_prepare_reference/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Viash component wrapping RSEM reference preparation.
name: "rsem_prepare_reference"
namespace: "rsem"
info:
  migration_info:
    git_repo: https://github.com/nf-core/rnaseq.git
    paths: [modules/nf-core/rsem/preparereference/main.nf, modules/nf-core/rsem/preparereference/meta.yml]
    last_sha: "92b2a7857de1dda9d1c19a088941fc81e2976ff7"

description: |
  Prepare a reference genome for RSEM.
argument_groups:
  - name: "Input"
    arguments:
      - name: "--fasta"
        type: file
        description: Genome fasta file
      - name: "--gtf"
        type: file
        description: GTF file
      # NOTE(review): presumably toggles building STAR indices alongside the
      # RSEM reference - confirm against script.sh.
      - name: "--star"
        type: boolean
      - name: "--versions"
        type: file
        description: Existing versions file to extend (optional).
        must_exist: false

  - name: "Output"
    arguments:
      - name: "--rsem"
        type: file
        direction: output
        description: RSEM index directory.
      - name: "--transcript_fasta"
        type: file
        direction: output
        description: Fasta file of transcripts
      - name: "--updated_versions"
        type: file
        description: Versions file including the record written by this component.
        default: versions.yml
        direction: output

resources:
  - type: bash_script
    path: script.sh

test_resources:
  - type: bash_script
    path: test.sh
  - path: /testData/minimal_test/reference/genome.fasta
  - path: /testData/minimal_test/reference/genes.gtf.gz

engines:
  - type: docker
    image: ubuntu:22.04
    setup:
      - type: docker
        # ca-certificates is installed so wget can verify the GitHub TLS
        # certificate; downloads no longer use --no-check-certificate.
        run: |
          apt-get update && \
          apt-get install -y --no-install-recommends build-essential ca-certificates gcc g++ make wget zlib1g-dev unzip && \
          apt-get clean && \
          rm -rf /var/lib/apt/lists/* && \
          wget https://github.com/alexdobin/STAR/archive/refs/tags/2.7.11a.zip && \
          unzip 2.7.11a.zip && \
          cp STAR-2.7.11a/bin/Linux_x86_64_static/STAR /usr/local/bin && \
          cd && \
          wget https://github.com/deweylab/RSEM/archive/refs/tags/v1.3.3.zip && \
          unzip v1.3.3.zip && \
          cd RSEM-1.3.3 && \
          make && \
          make install
runners:
  - type: executable
  - type: nextflow
Loading

0 comments on commit dc275da

Please sign in to comment.