diff --git a/CHANGELOG.md b/CHANGELOG.md index 07a83c15..9bfb5606 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * `bd_rhapsody/bd_rhapsody_sequence_analysis`: BD Rhapsody Sequence Analysis CWL pipeline (PR #96). +* `rsem/rsem_calculate_expression`: Calculate expression levels (PR #93). + ## MINOR CHANGES * Upgrade to Viash 0.9.0. diff --git a/src/rsem/rsem_calculate_expression/config.vsh.yaml b/src/rsem/rsem_calculate_expression/config.vsh.yaml new file mode 100644 index 00000000..2cd950cb --- /dev/null +++ b/src/rsem/rsem_calculate_expression/config.vsh.yaml @@ -0,0 +1,479 @@ +name: "rsem_calculate_expression" +namespace: "rsem" +description: | + Calculate expression with RSEM. +keywords: [Transcriptome, Index, Alignment, RSEM] +links: + homepage: https://deweylab.github.io/RSEM/ + documentation: https://deweylab.github.io/RSEM/rsem-calculate-expression.html + repository: https://github.com/deweylab/RSEM +references: + doi: https://doi.org/10.1186/1471-2105-12-323 +license: GPL-3.0 + + +argument_groups: +- name: "Input" + arguments: + - name: "--id" + type: string + description: Sample ID. + - name: "--strandedness" + type: string + description: Sample strand-specificity. Must be one of unstranded, forward, reverse + choices: [forward, reverse, unstranded] + - name: "--paired" + type: boolean_true + description: Paired-end reads or not? + - name: "--input" + type: file + description: Input reads for quantification. + multiple: true + - name: "--index" + type: file + must_exist: false + description: RSEM index. + - name: "--extra_args" + type: string + description: Extra rsem-calculate-expression arguments in addition to the examples. + +- name: "Output" + arguments: + - name: "--counts_gene" + type: file + description: Expression counts on gene level + example: $id.genes.results + direction: output + - name: "--counts_transcripts" + type: file + description: Expression counts on transcript level + example: $id.isoforms.results + direction: output + - name: "--stat" + type: file + description: RSEM statistics + example: $id.stat + direction: output + - name: "--logs" + type: file + description: RSEM logs + example: $id.log + direction: output + - name: "--bam_star" + type: file + description: BAM file generated by STAR (optional) + example: $id.STAR.genome.bam + direction: output + - name: "--bam_genome" + type: file + description: Genome BAM file (optional) + example: $id.genome.bam + direction: output + - name: "--bam_transcript" + type: file + description: Transcript BAM file (optional) + example: $id.transcript.bam + direction: output + - name: "--sort_bam_by_read_name" + type: boolean_true + description: | + Sort BAM file aligned under transcript coordidate by read name. Setting this option on will produce + deterministic maximum likelihood estimations from independent runs. Note that sorting will take long + time and lots of memory. + - name: "--no_bam_output" + type: boolean_true + description: Do not output any BAM file. + - name: "--sampling_for_bam" + type: boolean_true + description: | + When RSEM generates a BAM file, instead of outputting all alignments a read has with their posterior + probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure + includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the + sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is + sampled, all alignments appeared in the BAM file should have weight 0. + - name: "--output_genome_bam" + type: boolean_true + description: | + Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and + annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM + package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' + will be generated. + - name: "--sort_bam_by_coordinate" + type: boolean_true + description: | + Sort RSEM generated transcript and genome BAM files by coordinates and build associated indices. + +- name: "Basic Options" + arguments: + - name: "--no_qualities" + type: boolean_true + description: Input reads do not contain quality scores. + - name: "--alignments" + type: boolean_true + description: | + Input file contains alignments in SAM/BAM/CRAM format. The exact file format will be determined + automatically. + - name: "--fai" + type: file + description: | + If the header section of input alignment file does not contain reference sequence information, + this option should be turned on. is a FAI format file containing each reference sequence's + name and length. Please refer to the SAM official website for the details of FAI format. + - name: "--bowtie2" + type: boolean_true + description: | + Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM does not handle indel, local + and discordant alignments, the Bowtie2 parameters are set in a way to avoid those alignments. In + particular, we use options '--sensitive --dpad 0 --gbar 99999999 --mp 1,1 --np 1 --score_min L,0,-0.1' + by default. The last parameter of '--score_min', '-0.1', is the negative of maximum mismatch rate. + This rate can be set by option '--bowtie2_mismatch_rate'. If reads are paired-end, we additionally + use options '--no_mixed' and '--no_discordant'. + - name: "--star" + type: boolean_true + description: | + Use STAR to align reads. Alignment parameters are from ENCODE3's STAR-RSEM pipeline. To save + computational time and memory resources, STAR's Output BAM file is unsorted. It is stored in RSEM's + temporary directory with name as 'sample_name.bam'. Each STAR job will have its own private copy of + the genome in memory. + - name: "--hisat2_hca" + type: boolean_true + description: | + Use HISAT2 to align reads to the transcriptome according to Human Cell Atlast. + - name: "--append_names" + type: boolean_true + description: | + If gene_name/transcript_name is available, append it to the end of gene_id/transcript_id (separated + by '_') in files 'sample_name.isoforms.results' and 'sample_name.genes.results'. + - name: "--seed" + type: integer + description: | + Set the seed for the random number generators used in calculating posterior mean estimates and + credibility intervals. The seed must be a non-negative 32 bit integer. + - name: "--single_cell_prior" + type: boolean_true + description: | + By default, RSEM uses Dirichlet(1) as the prior to calculate posterior mean estimates and credibility + intervals. However, much less genes are expressed in single cell RNA-Seq data. Thus, if you want to + compute posterior mean estimates and/or credibility intervals and you have single-cell RNA-Seq data, + you are recommended to turn on this option. Then RSEM will use Dirichlet(0.1) as the prior which + encourage the sparsity of the expression levels. + - name: "--calc_pme" + type: boolean_true + description: Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. + - name: "--calc_ci" + type: boolean_true + description: | + Calculate 95% credibility intervals and posterior mean estimates. The credibility level can be + changed by setting '--ci_credibility_level'. + - name: "--quiet" + alternatives: "-q" + type: boolean_true + description: Suppress the output of logging information. + +- name: "Aligner Options" + arguments: + - name: "--seed_length" + type: integer + description: | + Seed length used by the read aligner. Providing the correct value is important for RSEM. If RSEM + runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read with its or at least + one of its mates' (for paired-end reads) length less than this value will be ignored. If the + references are not added poly(A) tails, the minimum allowed value is 5, otherwise, the minimum + allowed value is 25. Note that this script will only check if the value >= 5 and give a warning + message if the value < 25 but >= 5. (Default: 25) + example: 25 + - name: "--phred64_quals" + type: boolean_true + description: | + Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). This option is + used by Bowtie, Bowtie 2 and HISAT2. Otherwise, quality score will be encoded as Phred+33. (Default: false) + - name: "--solexa_quals" + type: boolean_true + description: | + Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). This option is used by + Bowtie, Bowtie 2 and HISAT2. Otherwise, quality score will be encoded as Phred+33. (Default: false) + - name: "--bowtie_n" + type: integer + description: | + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, Default: 2) + choices: [0, 1, 2, 3] + example: 2 + - name: "--bowtie_e" + type: integer + description: | + (Bowtie parameter) max sum of mismatch quality scores across the alignment. (Default: 99999999) + example: 99999999 + - name: "--bowtie_m" + type: integer + description: | + (Bowtie parameter) suppress all alignments for a read if > valid alignments exist. (Default: 200) + example: 200 + - name: "--bowtie_chunkmbs" + type: integer + description: | + (Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use Bowtie's default) + example: 0 + - name: "--bowtie2_mismatch_rate" + type: double + description: | + (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: 0.1) + example: 0.1 + - name: "--bowtie2_k" + type: integer + description: | + (Bowtie 2 parameter) Find up to alignments per read. (Default: 200) + example: 200 + - name: "--bowtie2_sensitivity_level" + type: string + description: | + (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end mode. This option controls how + hard Bowtie 2 tries to find alignments. must be one of "very_fast", "fast", "sensitive" + and "very_sensitive". The four candidates correspond to Bowtie 2's "--very-fast", "--fast", + "--sensitive" and "--very-sensitive" options. (Default: "sensitive" - use Bowtie 2's default) + choices: ["very_fast", "fast", "sensitive", "very_sensitive"] + example: sensitive + - name: "--star_gzipped_read_file" + type: boolean_true + description: | + Input read file(s) is compressed by gzip. (Default: false) + - name: "--star_bzipped_read_file" + type: boolean_true + description: | + Input read file(s) is compressed by bzip2. (Default: false) + - name: "--star_output_genome_bam" + type: boolean_true + description: | + Save the BAM file from STAR alignment under genomic coordinate to 'sample_name.STAR.genome.bam'. + This file is NOT sorted by genomic coordinate. In this file, according to STAR's manual, 'paired + ends of an alignment are always adjacent, and multiple alignments of a read are adjacent as well'. + (Default: false) + +- name: "Advanced Options" + arguments: + - name: "--tag" + type: string + description: | + The name of the optional field used in the SAM input for identifying a read with too many valid + alignments. The field should have the format :i:, where a bigger than 0 + indicates a read with too many alignments. (Default: "") + example: "" + - name: "--fragment_length_min" + type: integer + description: | + Minimum read/insert length allowed. This is also the value for the Bowtie/Bowtie2 -I option. + (Default: 1) + example: 1 + - name: "--fragment_length_max" + type: integer + description: | + Maximum read/insert length allowed. This is also the value for the Bowtie/Bowtie 2 -X option. + (Default: 1000) + example: 1000 + - name: "--fragment_length_mean" + type: integer + description: | + (single-end data only) The mean of the fragment length distribution, which is assumed to be a + Gaussian. (Default: -1, which disables use of the fragment length distribution) + example: -1 + - name: "--gragment_length_sd" + type: double + description: | + (single-end data only) The standard deviation of the fragment length distribution, which is + assumed to be a Gaussian. (Default: 0, which assumes that all fragments are of the same length, + given by the rounded value of --fragment_length_mean). + example: 0 + - name: "--estimate_rspd" + type: boolean_true + description: | + Set this option if you want to estimate the read start position distribution (RSPD) from data. + Otherwise, RSEM will use a uniform RSPD. + - name: "--num_rspd_bins" + type: integer + description: | + Number of bins in the RSPD. Only relevant when '--estimate_rspd' is specified. Use of the default + setting is recommended. (Default: 20) + example: 20 + - name: "--gibbs_burnin" + type: integer + description: | + The number of burn-in rounds for RSEM's Gibbs sampler. Each round passes over the entire data set + once. If RSEM can use multiple threads, multiple Gibbs samplers will start at the same time and all + samplers share the same burn-in number. (Default: 200) + example: 200 + - name: "--gibbs_number_of_samples" + type: integer + description: | + The total number of count vectors RSEM will collect from its Gibbs samplers. (Default: 1000) + example: 1000 + - name: "--gibbs_sampling_gap" + type: integer + description: | + The number of rounds between two succinct count vectors RSEM collects. If the count vector after + round N is collected, the count vector after round N + will also be collected. (Default: 1) + example: 1 + - name: "--ci_credibility_level" + type: double + description: | + The credibility level for credibility intervals. (Default: 0.95) + example: 0.95 + - name: "--ci_number_of_samples_per_count_vector" + type: integer + description: | + The number of read generating probability vectors sampled per sampled count vector. The crebility + intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled + count vector. This option controls how many Theta vectors are sampled per sampled count vector. + (Default: 50) + example: 50 + - name: "--keep_intermediate_files" + type: boolean_true + description: | + Keep temporary files generated by RSEM. RSEM creates a temporary directory, 'sample_name.temp', + into which it puts all intermediate output files. If this directory already exists, RSEM overwrites + all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the + temporary directory is deleted. Set this option to prevent the deletion of this directory and the + intermediate files inside of it. + - name: "--temporary_folder" + type: string + description: | + Set where to put the temporary files generated by RSEM. If the folder specified does not exist, + RSEM will try to create it. (Default: sample_name.temp) + example: sample_name.temp + - name: "--time" + type: boolean_true + description: | + Output time consumed by each step of RSEM to 'sample_name.time'. + +- name: "Prior-Enhanced RSEM Options" + arguments: + - name: "--run_pRSEM" + type: boolean_true + description: | + Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. isoform's initial pseudo-count for + RSEM's Gibbs sampling, will be learned from input RNA-seq data and an external data set. When pRSEM + needs and only needs ChIP-seq peak information to partition isoforms (e.g. in pRSEM's default + partition model), either ChIP-seq peak file (with the '--chipseq_peak_file' option) or ChIP-seq + FASTQ files for target and input and the path for Bowtie executables are required (with the + '--chipseq_target_read_files ', '--chipseq_control_read_files ', and '--bowtie_path + options), otherwise, ChIP-seq FASTQ files for target and control and the path to Bowtie + executables are required. + - name: "--chipseq_peak_file" + type: file + must_exist: true + description: | + Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. BED6+4, format. This file is used + when running prior-enhanced RSEM in the default two-partition model. It partitions isoforms by + whether they have ChIP-seq overlapping with their transcription start site region or not. Each + partition will have its own prior parameter learned from a training set. This file can be either + gzipped or ungzipped. + - name: "--chipseq_target_read_files" + type: file + must_exist: true + description: | + Comma-separated full path of FASTQ read file(s) for ChIP-seq target. This option is used when running + prior-enhanced RSEM. It provides information to calculate ChIP-seq peaks and signals. The file(s) + can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie_path ' + and '--chipseq_control_read_files ' must be defined when this option is specified. + - name: "--chipseq_control_read_files" + type: file + must_exist: true + description: | + Comma-separated full path of FASTQ read file(s) for ChIP-seq conrol. This option is used when running + prior-enhanced RSEM. It provides information to call ChIP-seq peaks. The file(s) can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie_path ' and + '--chipseq_target_read_files ' must be defined when this option is specified. + - name: "--chipseq_read_files_multi_targets" + type: file + must_exist: true + description: | + Comma-separated full path of FASTQ read files for multiple ChIP-seq targets. This option is used when + running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. It provides + information to calculate ChIP-seq signals. All files can be either ungzipped or gzipped with a suffix + '.gz' or '.gzip'. When this option is specified, the option '--bowtie_path ' must be defined and + the option '--partition_model ' will be set to 'cmb_lgt' automatically. + - name: "--chipseq_bed_files_multi_targets" + type: file + must_exist: true + description: | + Comma-separated full path of BED files for multiple ChIP-seq targets. This option is used when running + prior-enhanced RSEM, where prior is learned from multiple complementary data sets. It provides information + of ChIP-seq signals and must have at least the first six BED columns. All files can be either ungzipped + or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--partition_model + ' will be set to 'cmb_lgt' automatically. + - name: "--cap_stacked_chipseq_reads" + type: boolean_true + description: | + Keep a maximum number of ChIP-seq reads that aligned to the same genomic interval. This option is used + when running prior-enhanced RSEM, where prior is learned from multiple complementary data sets. This + option is only in use when either '--chipseq_read_files_multi_targets ' or + '--chipseq_bed_files_multi_targets ' is specified. + - name: "--n_max_stacked_chipseq_reads" + type: integer + description: | + The maximum number of stacked ChIP-seq reads to keep. This option is used when running prior-enhanced + RSEM, where prior is learned from multiple complementary data sets. This option is only in use when the + option '--cap_stacked_chipseq_reads' is set. + - name: "--partition_model" + type: string + description: | + A keyword to specify the partition model used by prior-enhanced RSEM. It must be one of the following + keywords: + * pk + * pk_lgtnopk + * lm3, lm4, lm5, or lm6 + * nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk + * pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk + * cmb_lgt + Parameters for all the above models are learned from a training set. For detailed explanations, please + see prior-enhanced RSEM's paper. (Default: 'pk') + example: "pk" + + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: ubuntu:22.04 + setup: + - type: apt + packages: + - build-essential + - gcc + - g++ + - make + - wget + - zlib1g-dev + - unzip + - type: docker + env: + - STAR_VERSION=2.7.11b + - RSEM_VERSION=1.3.3 + run: | + apt-get update && \ + apt-get clean && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/2.7.11a.zip && \ + unzip 2.7.11a.zip && \ + cp STAR-2.7.11a/bin/Linux_x86_64_static/STAR /usr/local/bin && \ + cd && \ + wget --no-check-certificate https://github.com/deweylab/RSEM/archive/refs/tags/v1.3.3.zip && \ + unzip v1.3.3.zip && \ + cd RSEM-1.3.3 && \ + make && \ + make install + - type: docker + run: | + echo "RSEM: `rsem-calculate-expression --version | sed -e 's/Current version: RSEM v//g'`" > /var/software_versions.txt && \ + echo "STAR: `STAR --version`" >> /var/software_versions.txt && \ + echo "bowtie2: `bowtie2 --version | grep -oP '\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "bowtie: `bowtie --version | grep -oP 'bowtie-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt && \ + echo "HISAT2: `hisat2 --version | grep -oP 'hisat2-align-s version \K\d+\.\d+\.\d+'`" >> /var/software_versions.txt +runners: + - type: executable + - type: nextflow + + diff --git a/src/rsem/rsem_calculate_expression/help.txt b/src/rsem/rsem_calculate_expression/help.txt new file mode 100644 index 00000000..edfa3333 --- /dev/null +++ b/src/rsem/rsem_calculate_expression/help.txt @@ -0,0 +1,1002 @@ +NAME + rsem-calculate-expression - Estimate gene and isoform expression from + RNA-Seq data. + +SYNOPSIS + rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name + +ARGUMENTS + upstream_read_files(s) + Comma-separated list of files containing single-end reads or + upstream reads for paired-end data. By default, these files are + assumed to be in FASTQ format. If the --no-qualities option is + specified, then FASTA format is expected. + + downstream_read_file(s) + Comma-separated list of files containing downstream reads which are + paired with the upstream reads. By default, these files are assumed + to be in FASTQ format. If the --no-qualities option is specified, + then FASTA format is expected. + + input + SAM/BAM/CRAM formatted input file. If "-" is specified for the + filename, the input is instead assumed to come from standard input. + RSEM requires all alignments of the same read group together. For + paired-end reads, RSEM also requires the two mates of any alignment + be adjacent. In addition, RSEM does not allow the SEQ and QUAL + fields to be empty. See Description section for how to make input + file obey RSEM's requirements. + + reference_name + The name of the reference used. The user must have run + 'rsem-prepare-reference' with this reference_name before running + this program. + + sample_name + The name of the sample analyzed. All output files are prefixed by + this name (e.g., sample_name.genes.results) + +BASIC OPTIONS + --paired-end + Input reads are paired-end reads. (Default: off) + + --no-qualities + Input reads do not contain quality scores. (Default: off) + + --strandedness + This option defines the strandedness of the RNA-Seq reads. It + recognizes three values: 'none', 'forward', and 'reverse'. 'none' + refers to non-strand-specific protocols. 'forward' means all + (upstream) reads are derived from the forward strand. 'reverse' + means all (upstream) reads are derived from the reverse strand. If + 'forward'/'reverse' is set, the '--norc'/'--nofw' Bowtie/Bowtie 2 + option will also be enabled to avoid aligning reads to the opposite + strand. For Illumina TruSeq Stranded protocols, please use + 'reverse'. (Default: 'none') + + -p/--num-threads + Number of threads to use. Both Bowtie/Bowtie2, expression estimation + and 'samtools sort' will use this many threads. (Default: 1) + + --alignments + Input file contains alignments in SAM/BAM/CRAM format. The exact + file format will be determined automatically. (Default: off) + + --fai + If the header section of input alignment file does not contain + reference sequence information, this option should be turned on. + is a FAI format file containing each reference sequence's + name and length. Please refer to the SAM official website for the + details of FAI format. (Default: off) + + --bowtie2 + Use Bowtie 2 instead of Bowtie to align reads. Since currently RSEM + does not handle indel, local and discordant alignments, the Bowtie2 + parameters are set in a way to avoid those alignments. In + particular, we use options '--sensitive --dpad 0 --gbar 99999999 + --mp 1,1 --np 1 --score-min L,0,-0.1' by default. The last parameter + of '--score-min', '-0.1', is the negative of maximum mismatch rate. + This rate can be set by option '--bowtie2-mismatch-rate'. If reads + are paired-end, we additionally use options '--no-mixed' and + '--no-discordant'. (Default: off) + + --star + Use STAR to align reads. Alignment parameters are from ENCODE3's + STAR-RSEM pipeline. To save computational time and memory resources, + STAR's Output BAM file is unsorted. It is stored in RSEM's temporary + directory with name as 'sample_name.bam'. Each STAR job will have + its own private copy of the genome in memory. (Default: off) + + --hisat2-hca + Use HISAT2 to align reads to the transcriptome according to Human + Cell Atlast SMART-Seq2 pipeline. In particular, we use HISAT + parameters "-k 10 --secondary --rg-id=$sampleToken --rg + SM:$sampleToken --rg LB:$sampleToken --rg PL:ILLUMINA --rg + PU:$sampleToken --new-summary --summary-file $sampleName.log + --met-file $sampleName.hisat2.met.txt --met 5 --mp 1,1 --np 1 + --score-min L,0,-0.1 --rdg 99999999,99999999 --rfg 99999999,99999999 + --no-spliced-alignment --no-softclip --seed 12345". If inputs are + paired-end reads, we additionally use parameters "--no-mixed + --no-discordant". (Default: off) + + --append-names + If gene_name/transcript_name is available, append it to the end of + gene_id/transcript_id (separated by '_') in files + 'sample_name.isoforms.results' and 'sample_name.genes.results'. + (Default: off) + + --seed + Set the seed for the random number generators used in calculating + posterior mean estimates and credibility intervals. The seed must be + a non-negative 32 bit integer. (Default: off) + + --single-cell-prior + By default, RSEM uses Dirichlet(1) as the prior to calculate + posterior mean estimates and credibility intervals. However, much + less genes are expressed in single cell RNA-Seq data. Thus, if you + want to compute posterior mean estimates and/or credibility + intervals and you have single-cell RNA-Seq data, you are recommended + to turn on this option. Then RSEM will use Dirichlet(0.1) as the + prior which encourage the sparsity of the expression levels. + (Default: off) + + --calc-pme + Run RSEM's collapsed Gibbs sampler to calculate posterior mean + estimates. (Default: off) + + --calc-ci + Calculate 95% credibility intervals and posterior mean estimates. + The credibility level can be changed by setting + '--ci-credibility-level'. (Default: off) + + -q/--quiet + Suppress the output of logging information. (Default: off) + + -h/--help + Show help information. + + --version + Show version information. + +OUTPUT OPTIONS + --sort-bam-by-read-name + Sort BAM file aligned under transcript coordidate by read name. + Setting this option on will produce deterministic maximum likelihood + estimations from independent runs. Note that sorting will take long + time and lots of memory. (Default: off) + + --no-bam-output + Do not output any BAM file. (Default: off) + + --sampling-for-bam + When RSEM generates a BAM file, instead of outputting all alignments + a read has with their posterior probabilities, one alignment is + sampled according to the posterior probabilities. The sampling + procedure includes the alignment to the "noise" transcript, which + does not appear in the BAM file. Only the sampled alignment has a + weight of 1. All other alignments have weight 0. If the "noise" + transcript is sampled, all alignments appeared in the BAM file + should have weight 0. (Default: off) + + --output-genome-bam + Generate a BAM file, 'sample_name.genome.bam', with alignments + mapped to genomic coordinates and annotated with their posterior + probabilities. In addition, RSEM will call samtools (included in + RSEM package) to sort and index the bam file. + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' will be generated. (Default: + off) + + --sort-bam-by-coordinate + Sort RSEM generated transcript and genome BAM files by coordinates + and build associated indices. (Default: off) + + --sort-bam-memory-per-thread + Set the maximum memory per thread that can be used by 'samtools + sort'. represents the memory and accepts suffices 'K/M/G'. + RSEM will pass to the '-m' option of 'samtools sort'. Note + that the default used here is different from the default used by + samtools. (Default: 1G) + +ALIGNER OPTIONS + --seed-length + Seed length used by the read aligner. Providing the correct value is + important for RSEM. If RSEM runs Bowtie, it uses this value for + Bowtie's seed length parameter. Any read with its or at least one of + its mates' (for paired-end reads) length less than this value will + be ignored. If the references are not added poly(A) tails, the + minimum allowed value is 5, otherwise, the minimum allowed value is + 25. Note that this script will only check if the value >= 5 and give + a warning message if the value < 25 but >= 5. (Default: 25) + + --phred33-quals + Input quality scores are encoded as Phred+33. This option is used by + Bowtie, Bowtie 2 and HISAT2. (Default: on) + + --phred64-quals + Input quality scores are encoded as Phred+64 (default for GA + Pipeline ver. >= 1.3). This option is used by Bowtie, Bowtie 2 and + HISAT2. (Default: off) + + --solexa-quals + Input quality scores are solexa encoded (from GA Pipeline ver. < + 1.3). This option is used by Bowtie, Bowtie 2 and HISAT2. (Default: + off) + + --bowtie-path + The path to the Bowtie executables. (Default: the path to the Bowtie + executables is assumed to be in the user's PATH environment + variable) + + --bowtie-n + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, + Default: 2) + + --bowtie-e + (Bowtie parameter) max sum of mismatch quality scores across the + alignment. (Default: 99999999) + + --bowtie-m + (Bowtie parameter) suppress all alignments for a read if > + valid alignments exist. (Default: 200) + + --bowtie-chunkmbs + (Bowtie parameter) memory allocated for best first alignment + calculation (Default: 0 - use Bowtie's default) + + --bowtie2-path + (Bowtie 2 parameter) The path to the Bowtie 2 executables. (Default: + the path to the Bowtie 2 executables is assumed to be in the user's + PATH environment variable) + + --bowtie2-mismatch-rate + (Bowtie 2 parameter) The maximum mismatch rate allowed. (Default: + 0.1) + + --bowtie2-k + (Bowtie 2 parameter) Find up to alignments per read. (Default: + 200) + + --bowtie2-sensitivity-level + (Bowtie 2 parameter) Set Bowtie 2's preset options in --end-to-end + mode. This option controls how hard Bowtie 2 tries to find + alignments. must be one of "very_fast", "fast", "sensitive" + and "very_sensitive". The four candidates correspond to Bowtie 2's + "--very-fast", "--fast", "--sensitive" and "--very-sensitive" + options. (Default: "sensitive" - use Bowtie 2's default) + + --star-path + The path to STAR's executable. (Default: the path to STAR executable + is assumed to be in user's PATH environment variable) + + --star-gzipped-read-file + (STAR parameter) Input read file(s) is compressed by gzip. (Default: + off) + + --star-bzipped-read-file + (STAR parameter) Input read file(s) is compressed by bzip2. + (Default: off) + + --star-output-genome-bam + (STAR parameter) Save the BAM file from STAR alignment under genomic + coordinate to 'sample_name.STAR.genome.bam'. This file is NOT sorted + by genomic coordinate. In this file, according to STAR's manual, + 'paired ends of an alignment are always adjacent, and multiple + alignments of a read are adjacent as well'. (Default: off) + + --hisat2-path + The path to HISAT2's executable. (Default: the path to HISAT2 + executable is assumed to be in user's PATH environment variable) + +ADVANCED OPTIONS + --tag + The name of the optional field used in the SAM input for identifying + a read with too many valid alignments. The field should have the + format :i:, where a bigger than 0 indicates + a read with too many alignments. (Default: "") + + --fragment-length-min + Minimum read/insert length allowed. This is also the value for the + Bowtie/Bowtie2 -I option. (Default: 1) + + --fragment-length-max + Maximum read/insert length allowed. This is also the value for the + Bowtie/Bowtie 2 -X option. (Default: 1000) + + --fragment-length-mean + (single-end data only) The mean of the fragment length distribution, + which is assumed to be a Gaussian. (Default: -1, which disables use + of the fragment length distribution) + + --fragment-length-sd + (single-end data only) The standard deviation of the fragment length + distribution, which is assumed to be a Gaussian. (Default: 0, which + assumes that all fragments are of the same length, given by the + rounded value of --fragment-length-mean) + + --estimate-rspd + Set this option if you want to estimate the read start position + distribution (RSPD) from data. Otherwise, RSEM will use a uniform + RSPD. (Default: off) + + --num-rspd-bins + Number of bins in the RSPD. Only relevant when '--estimate-rspd' is + specified. Use of the default setting is recommended. (Default: 20) + + --gibbs-burnin + The number of burn-in rounds for RSEM's Gibbs sampler. Each round + passes over the entire data set once. If RSEM can use multiple + threads, multiple Gibbs samplers will start at the same time and all + samplers share the same burn-in number. (Default: 200) + + --gibbs-number-of-samples + The total number of count vectors RSEM will collect from its Gibbs + samplers. (Default: 1000) + + --gibbs-sampling-gap + The number of rounds between two succinct count vectors RSEM + collects. If the count vector after round N is collected, the count + vector after round N + will also be collected. (Default: 1) + + --ci-credibility-level + The credibility level for credibility intervals. (Default: 0.95) + + --ci-memory + Maximum size (in memory, MB) of the auxiliary buffer used for + computing credibility intervals (CI). (Default: 1024) + + --ci-number-of-samples-per-count-vector + The number of read generating probability vectors sampled per + sampled count vector. The crebility intervals are calculated by + first sampling P(C | D) and then sampling P(Theta | C) for each + sampled count vector. This option controls how many Theta vectors + are sampled per sampled count vector. (Default: 50) + + --keep-intermediate-files + Keep temporary files generated by RSEM. RSEM creates a temporary + directory, 'sample_name.temp', into which it puts all intermediate + output files. If this directory already exists, RSEM overwrites all + files generated by previous RSEM runs inside of it. By default, + after RSEM finishes, the temporary directory is deleted. Set this + option to prevent the deletion of this directory and the + intermediate files inside of it. (Default: off) + + --temporary-folder + Set where to put the temporary files generated by RSEM. If the + folder specified does not exist, RSEM will try to create it. + (Default: sample_name.temp) + + --time + Output time consumed by each step of RSEM to 'sample_name.time'. + (Default: off) + +PRIOR-ENHANCED RSEM OPTIONS + --run-pRSEM + Running prior-enhanced RSEM (pRSEM). Prior parameters, i.e. + isoform's initial pseudo-count for RSEM's Gibbs sampling, will be + learned from input RNA-seq data and an external data set. When pRSEM + needs and only needs ChIP-seq peak information to partition isoforms + (e.g. in pRSEM's default partition model), either ChIP-seq peak file + (with the '--chipseq-peak-file' option) or ChIP-seq FASTQ files for + target and input and the path for Bowtie executables are required + (with the '--chipseq-target-read-files ', + '--chipseq-control-read-files ', and '--bowtie-path + options), otherwise, ChIP-seq FASTQ files for target and control and + the path to Bowtie executables are required. (Default: off) + + --chipseq-peak-file + Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. + BED6+4, format. This file is used when running prior-enhanced RSEM + in the default two-partition model. It partitions isoforms by + whether they have ChIP-seq overlapping with their transcription + start site region or not. Each partition will have its own prior + parameter learned from a training set. This file can be either + gzipped or ungzipped. (Default: "") + + --chipseq-target-read-files + Comma-separated full path of FASTQ read file(s) for ChIP-seq target. + This option is used when running prior-enhanced RSEM. It provides + information to calculate ChIP-seq peaks and signals. The file(s) can + be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The + options '--bowtie-path ' and '--chipseq-control-read-files + ' must be defined when this option is specified. (Default: + "") + + --chipseq-control-read-files + Comma-separated full path of FASTQ read file(s) for ChIP-seq conrol. + This option is used when running prior-enhanced RSEM. It provides + information to call ChIP-seq peaks. The file(s) can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options + '--bowtie-path ' and '--chipseq-target-read-files ' + must be defined when this option is specified. (Default: "") + + --chipseq-read-files-multi-targets + Comma-separated full path of FASTQ read files for multiple ChIP-seq + targets. This option is used when running prior-enhanced RSEM, where + prior is learned from multiple complementary data sets. It provides + information to calculate ChIP-seq signals. All files can be either + ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this + option is specified, the option '--bowtie-path ' must be + defined and the option '--partition-model ' will be set to + 'cmb_lgt' automatically. (Default: "") + + --chipseq-bed-files-multi-targets + Comma-separated full path of BED files for multiple ChIP-seq + targets. This option is used when running prior-enhanced RSEM, where + prior is learned from multiple complementary data sets. It provides + information of ChIP-seq signals and must have at least the first six + BED columns. All files can be either ungzipped or gzipped with a + suffix '.gz' or '.gzip'. When this option is specified, the option + '--partition-model ' will be set to 'cmb_lgt' automatically. + (Default: "") + + --cap-stacked-chipseq-reads + Keep a maximum number of ChIP-seq reads that aligned to the same + genomic interval. This option is used when running prior-enhanced + RSEM, where prior is learned from multiple complementary data sets. + This option is only in use when either + '--chipseq-read-files-multi-targets ' or + '--chipseq-bed-files-multi-targets ' is specified. (Default: + off) + + --n-max-stacked-chipseq-reads + The maximum number of stacked ChIP-seq reads to keep. This option is + used when running prior-enhanced RSEM, where prior is learned from + multiple complementary data sets. This option is only in use when + the option '--cap-stacked-chipseq-reads' is set. (Default: 5) + + --partition-model + A keyword to specify the partition model used by prior-enhanced + RSEM. It must be one of the following keywords: + + - pk + Partitioned by whether an isoform has a ChIP-seq peak overlapping + with its transcription start site (TSS) region. The TSS region is + defined as [TSS-500bp, TSS+500bp]. For simplicity, we refer this + type of peak as 'TSS peak' when explaining other keywords. + + - pk_lgtnopk + First partitioned by TSS peak. Then, for isoforms in the 'no TSS + peak' set, a logistic model is employed to further classify them + into two partitions. + + - lm3, lm4, lm5, or lm6 + Based on their ChIP-seq signals, isoforms are classified into 3, + 4, 5, or 6 partitions by a linear regression model. + + - nopk_lm2pk, nopk_lm3pk, nopk_lm4pk, or nopk_lm5pk + First partitioned by TSS peak. Then, for isoforms in the 'with TSS + peak' set, a linear regression model is employed to further + classify them into 2, 3, 4, or 5 partitions. + + - pk_lm2nopk, pk_lm3nopk, pk_lm4nopk, or pk_lm5nopk + First partitioned by TSS peak. Then, for isoforms in the 'no TSS + peak' set, a linear regression model is employed to further + classify them into 2, 3, 4, or 5 partitions. + + - cmb_lgt + Using a logistic regression to combine TSS signals from multiple + complementary data sets and partition training set isoform into + 'expressed' and 'not expressed'. This partition model is only in + use when either '--chipseq-read-files-multi-targets ' or + '--chipseq-bed-files-multi-targets is specified. + + Parameters for all the above models are learned from a training set. + For detailed explanations, please see prior-enhanced RSEM's paper. + (Default: 'pk') + +DEPRECATED OPTIONS + The options in this section are deprecated. They are here only for + compatibility reasons and may be removed in future releases. + + --sam + Inputs are alignments in SAM format. (Default: off) + + --bam + Inputs are alignments in BAM format. (Default: off) + + --strand-specific + Equivalent to '--strandedness forward'. (Default: off) + + --forward-prob + Probability of generating a read from the forward strand of a + transcript. Set to 1 for a strand-specific protocol where all + (upstream) reads are derived from the forward strand, 0 for a + strand-specific protocol where all (upstream) read are derived from + the reverse strand, or 0.5 for a non-strand-specific protocol. + (Default: off) + +DESCRIPTION + In its default mode, this program aligns input reads against a reference + transcriptome with Bowtie and calculates expression values using the + alignments. RSEM assumes the data are single-end reads with quality + scores, unless the '--paired-end' or '--no-qualities' options are + specified. Alternatively, users can use STAR to align reads using the + '--star' option. RSEM has provided options in 'rsem-prepare-reference' + to prepare STAR's genome indices. Users may use an alternative aligner + by specifying '--alignments', and providing an alignment file in + SAM/BAM/CRAM format. However, users should make sure that they align + against the indices generated by 'rsem-prepare-reference' and the + alignment file satisfies the requirements mentioned in ARGUMENTS + section. + + One simple way to make the alignment file satisfying RSEM's requirements + is to use the 'convert-sam-for-rsem' script. This script accepts + SAM/BAM/CRAM files as input and outputs a BAM file. For example, type + the following command to convert a SAM file, 'input.sam', to a + ready-for-use BAM file, 'input_for_rsem.bam': + + convert-sam-for-rsem input.sam input_for_rsem + + For details, please refer to 'convert-sam-for-rsem's documentation page. + +NOTES + 1. Users must run 'rsem-prepare-reference' with the appropriate + reference before using this program. + + 2. For single-end data, it is strongly recommended that the user provide + the fragment length distribution parameters (--fragment-length-mean and + --fragment-length-sd). For paired-end data, RSEM will automatically + learn a fragment length distribution from the data. + + 3. Some aligner parameters have default values different from their + original settings. + + 4. With the '--calc-pme' option, posterior mean estimates will be + calculated in addition to maximum likelihood estimates. + + 5. With the '--calc-ci' option, 95% credibility intervals and posterior + mean estimates will be calculated in addition to maximum likelihood + estimates. + + 6. The temporary directory and all intermediate files will be removed + when RSEM finishes unless '--keep-intermediate-files' is specified. + + With the '--run-pRSEM' option and associated options (see section + 'PRIOR-ENHANCED RSEM OPTIONS' above for details), prior-enhanced RSEM + will be running. Prior parameters will be learned from supplied external + data set(s) and assigned as initial pseudo-counts for isoforms in the + corresponding partition for Gibbs sampling. + +OUTPUT + sample_name.isoforms.results + File containing isoform level expression estimates. The first line + contains column names separated by the tab character. The format of + each line in the rest of this file is: + + transcript_id gene_id length effective_length expected_count TPM + FPKM IsoPct [posterior_mean_count + posterior_standard_deviation_of_count pme_TPM pme_FPKM + IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'transcript_id' is the transcript name of this transcript. 'gene_id' + is the gene name of the gene which this transcript belongs to + (denote this gene as its parent gene). If no gene information is + provided, 'gene_id' and 'transcript_id' are the same. + + 'length' is this transcript's sequence length (poly(A) tail is not + counted). 'effective_length' counts only the positions that can + generate a valid fragment. If no poly(A) tail is added, + 'effective_length' is equal to transcript length - mean fragment + length + 1. If one transcript's effective length is less than 1, + this transcript's both effective length and abundance estimates are + set to 0. + + 'expected_count' is the sum of the posterior probability of each + read comes from this transcript over all reads. Because 1) each read + aligning to this transcript has a probability of being generated + from background noise; 2) RSEM may filter some alignable low quality + reads, the sum of expected counts for all transcript are generally + less than the total number of reads aligned. + + 'TPM' stands for Transcripts Per Million. It is a relative measure + of transcript abundance. The sum of all transcripts' TPM is 1 + million. 'FPKM' stands for Fragments Per Kilobase of transcript per + Million mapped reads. It is another relative measure of transcript + abundance. If we define l_bar be the mean transcript length in a + sample, which can be calculated as + + l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through + every transcript), + + the following equation is hold: + + FPKM_i = 10^3 / l_bar * TPM_i. + + We can see that the sum of FPKM is not a constant across samples. + + 'IsoPct' stands for isoform percentage. It is the percentage of this + transcript's abandunce over its parent gene's abandunce. If its + parent gene has only one isoform or the gene information is not + provided, this field will be set to 100. + + 'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean + estimates calculated by RSEM's Gibbs sampler. + 'posterior_standard_deviation_of_count' is the posterior standard + deviation of counts. 'IsoPct_from_pme_TPM' is the isoform percentage + calculated from 'pme_TPM' values. + + 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' + and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% + credibility intervals for TPM and FPKM values. The bounds are + inclusive (i.e. [l, u]). + + 'TPM_coefficient_of_quartile_variation' and + 'FPKM_coefficient_of_quartile_variation' are coefficients of + quartile variation (CQV) for TPM and FPKM values. CQV is a robust + way of measuring the ratio between the standard deviation and the + mean. It is defined as + + CQV := (Q3 - Q1) / (Q3 + Q1), + + where Q1 and Q3 are the first and third quartiles. + + sample_name.genes.results + File containing gene level expression estimates. The first line + contains column names separated by the tab character. The format of + each line in the rest of this file is: + + gene_id transcript_id(s) length effective_length expected_count TPM + FPKM [posterior_mean_count posterior_standard_deviation_of_count + pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'transcript_id(s)' is a comma-separated list of transcript_ids + belonging to this gene. If no gene information is provided, + 'gene_id' and 'transcript_id(s)' are identical (the + 'transcript_id'). + + A gene's 'length' and 'effective_length' are defined as the weighted + average of its transcripts' lengths and effective lengths (weighted + by 'IsoPct'). A gene's abundance estimates are just the sum of its + transcripts' abundance estimates. + + sample_name.alleles.results + Only generated when the RSEM references are built with + allele-specific transcripts. + + This file contains allele level expression estimates for + allele-specific expression calculation. The first line contains + column names separated by the tab character. The format of each line + in the rest of this file is: + + allele_id transcript_id gene_id length effective_length + expected_count TPM FPKM AlleleIsoPct AlleleGenePct + [posterior_mean_count posterior_standard_deviation_of_count pme_TPM + pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM + TPM_ci_lower_bound TPM_ci_upper_bound + TPM_coefficient_of_quartile_variation FPKM_ci_lower_bound + FPKM_ci_upper_bound FPKM_coefficient_of_quartile_variation] + + Fields are separated by the tab character. Fields within "[]" are + optional. They will not be presented if neither '--calc-pme' nor + '--calc-ci' is set. + + 'allele_id' is the allele-specific name of this allele-specific + transcript. + + 'AlleleIsoPct' stands for allele-specific percentage on isoform + level. It is the percentage of this allele-specific transcript's + abundance over its parent transcript's abundance. If its parent + transcript has only one allele variant form, this field will be set + to 100. + + 'AlleleGenePct' stands for allele-specific percentage on gene level. + It is the percentage of this allele-specific transcript's abundance + over its parent gene's abundance. + + 'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have + similar meanings. They are calculated based on posterior mean + estimates. + + Please note that if this file is present, the fields 'length' and + 'effective_length' in 'sample_name.isoforms.results' should be + interpreted similarly as the corresponding definitions in + 'sample_name.genes.results'. + + sample_name.transcript.bam + Only generated when --no-bam-output is not specified. + + 'sample_name.transcript.bam' is a BAM-formatted file of read + alignments in transcript coordinates. The MAPQ field of each + alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), + where w is the posterior probability of that alignment being the + true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, + where value is a single precision floating number representing the + posterior probability. Because this file contains all alignment + lines produced by bowtie or user-specified aligners, it can also be + used as a replacement of the aligner generated BAM/SAM file. + + sample_name.transcript.sorted.bam and + sample_name.transcript.sorted.bam.bai + Only generated when --no-bam-output is not specified and + --sort-bam-by-coordinate is specified. + + 'sample_name.transcript.sorted.bam' and + 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.genome.bam + Only generated when --no-bam-output is not specified and + --output-genome-bam is specified. + + 'sample_name.genome.bam' is a BAM-formatted file of read alignments + in genomic coordinates. Alignments of reads that have identical + genomic coordinates (i.e., alignments to different isoforms that + share the same genomic region) are collapsed into one alignment. The + MAPQ field of each alignment is set to min(100, floor(-10 * + log10(1.0 - w) + 0.5)), where w is the posterior probability of that + alignment being the true mapping of a read. In addition, RSEM pads a + new tag ZW:f:value, where value is a single precision floating + number representing the posterior probability. If an alignment is + spliced, a XS:A:value tag is also added, where value is either '+' + or '-' indicating the strand of the transcript it aligns to. + + sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai + Only generated when --no-bam-output is not specified, and + --sort-bam-by-coordinate and --output-genome-bam are specified. + + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.time + Only generated when --time is specified. + + It contains time (in seconds) consumed by aligning reads, estimating + expression levels and calculating credibility intervals. + + sample_name.log + Only generated when --alignments is not specified. + + It captures alignment statistics outputted from the user-specified + aligner. + + sample_name.stat + This is a folder instead of a file. All model related statistics are + stored in this folder. Use 'rsem-plot-model' can generate plots + using this folder. + + 'sample_name.stat/sample_name.cnt' contains alignment statistics. + The format and meanings of each field are described in + 'cnt_file_description.txt' under RSEM directory. + + 'sample_name.stat/sample_name.model' stores RNA-Seq model parameters + learned from the data. The format and meanings of each filed of this + file are described in 'model_file_description.txt' under RSEM + directory. + + The following four output files will be generated only by + prior-enhanced RSEM + + - 'sample_name.stat/sample_name_prsem.all_tr_features' + It stores isofrom features for deriving and assigning pRSEM prior. + The first line is a header and the rest is one isoform per line. + The description for each column is: + + * trid: transcript ID from input annotation + + * geneid: gene ID from input anntation + + * chrom: isoform's chromosome name + + * strand: isoform's strand name + + * start: isoform's end with the lowest genomic loci + + * end: isoform's end with the highest genomic loci + + * tss_mpp: average mappability of [TSS-500bp, TSS+500bp], where + TSS is isoform's transcription start site, i.e. 5'-end + + * body_mpp: average mappability of (TSS+500bp, TES-500bp), where + TES is isoform's transcription end site, i.e. 3'-end + + * tes_mpp: average mappability of [TES-500bp, TES+500bp] + + * pme_count: isoform's fragment or read count from RSEM's + posterior mean estimates + + * tss: isoform's TSS loci + + * tss_pk: equal to 1 if isoform's [TSS-500bp, TSS+500bp] region + overlaps with a RNA Pol II peak; 0 otherwise + + * is_training: equal to 1 if isoform is in the training set where + Pol II prior is learned; 0 otherwise + + - 'sample_name.stat/sample_name_prsem.all_tr_prior' + It stores prior parameters for every isoform. This file does not + have a header. Each line contains a prior parameter and an + isoform's transcript ID delimited by ` # `. + + - 'sample_name.stat/sample_name_uniform_prior_1.isoforms.results' + RSEM's posterior mean estimates on the isoform level with an + initial pseudo-count of one for every isoform. It is in the same + format as the 'sample_name.isoforms.results'. + + - 'sample_name.stat/sample_name_uniform_prior_1.genes.results' + RSEM's posterior mean estimates on the gene level with an initial + pseudo-count of one for every isoform. It is in the same format as + the 'sample_name.genes.results'. + + When learning prior from multiple external data sets in + prior-enhanced RSEM, two additional output files will be generated. + + - 'sample_name.stat/sample_name.pval_LL' + It stores a p-value and a log-likelihood. The p-value indicates + whether the combination of multiple complementary data sets is + informative for RNA-seq quantification. The log-likelihood shows + how well pRSEM's Dirichlet-multinomial model fits the read counts + of partitioned training set isoforms. + + - 'sample_name.stat/sample_name.lgt_mdl.RData' + It stores an R object named 'glmmdl', which is a logistic + regression model on the training set isoforms and multiple + external data sets. + + In addition, extra columns will be added to + 'sample_name.stat/all_tr_features' + + * is_expr: equal to 1 if isoform has an abundance >= 1 TPM and a + non-zero read count from RSEM's posterior mean estimates; 0 + otherwise + + * "$external_data_set_basename": log10 of external data's signal at + [TSS-500, TSS+500]. Signal is the number of reads aligned within + that interval and normalized to RPKM by read depth and interval + length. It will be set to -4 if no read aligned to that interval. + + There are multiple columns like this one, where each represents an + external data set. + + * prd_expr_prob: predicted probability from logistic regression + model on whether this isoform is expressed or not. A probability + higher than 0.5 is considered as expressed + + * partition: group index, to which this isoforms is partitioned + + * prior: prior parameter for this isoform + +EXAMPLES + Assume the path to the bowtie executables is in the user's PATH + environment variable. Reference files are under '/ref' with name + 'mouse_125'. + + 1) '/data/mmliver.fq', single-end reads with quality scores. Quality + scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 + threads and generate a genome BAM file. In addition, we want to append + gene/transcript names to the result files: + + rsem-calculate-expression --phred64-quals \ + -p 8 \ + --append-names \ + --output-genome-bam \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', stranded paired-end + reads with quality scores. Suppose the library is prepared using TruSeq + Stranded Kit, which means the first mate should map to the reverse + strand. Quality scores are in SANGER format. We want to use 8 threads + and do not generate a genome BAM file: + + rsem-calculate-expression -p 8 \ + --paired-end \ + --strandedness reverse \ + /data/mmliver_1.fq \ + /data/mmliver_2.fq \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 3) '/data/mmliver.fa', single-end reads without quality scores. We want + to use 8 threads: + + rsem-calculate-expression -p 8 \ + --no-qualities \ + /data/mmliver.fa \ + /ref/mouse_125 \ + mmliver_single_without_quals + + 4) Data are the same as 1). This time we assume the bowtie executables + are under '/sw/bowtie'. We want to take a fragment length distribution + into consideration. We set the fragment length mean to 150 and the + standard deviation to 35. In addition to a BAM file, we also want to + generate credibility intervals. We allow RSEM to use 1GB of memory for + CI calculation: + + rsem-calculate-expression --bowtie-path /sw/bowtie \ + --phred64-quals \ + --fragment-length-mean 150.0 \ + --fragment-length-sd 35.0 \ + -p 8 \ + --output-genome-bam \ + --calc-ci \ + --ci-memory 1024 \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 5) '/data/mmliver_paired_end_quals.bam', BAM-formatted alignments for + paired-end reads with quality scores. We want to use 8 threads: + + rsem-calculate-expression --paired-end \ + --alignments \ + -p 8 \ + /data/mmliver_paired_end_quals.bam \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 6) '/data/mmliver_1.fq.gz' and '/data/mmliver_2.fq.gz', paired-end reads + with quality scores and read files are compressed by gzip. We want to + use STAR to aligned reads and assume STAR executable is '/sw/STAR'. + Suppose we want to use 8 threads and do not generate a genome BAM file: + + rsem-calculate-expression --paired-end \ + --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 7) In the above example, suppose we want to run prior-enhanced RSEM + instead. Assuming we want to learn priors from a ChIP-seq peak file + '/data/mmlive.narrowPeak.gz': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --chipseq-peak-file /data/mmliver.narrowPeak.gz \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 8) Similar to the example in 7), suppose we want to use the partition + model 'pk_lm2nopk' (partitioning isoforms by Pol II TSS peak first and + then partitioning 'no TSS peak' isoforms into two bins by a linear + regression model), and we want to partition isoforms by RNA Pol II's + ChIP-seq read files '/data/mmliver_PolIIRep1.fq.gz' and + '/data/mmliver_PolIIRep2.fq.gz', and the control ChIP-seq read files + '/data/mmliver_ChIPseqCtrl.fq.gz'. Also, assuming Bowtie's executables + are under '/sw/bowtie/': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --chipseq-target-read-files /data/mmliver_PolIIRep1.fq.gz,/data/mmliver_PolIIRep2.fq.gz \ + --chipseq-control-read-files /data/mmliver_ChIPseqCtrl.fq.gz \ + --partition-model pk_lm2nopk \ + --bowtie-path /sw/bowtie \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 9) Similar to the example in 8), suppose we want to derive prior from + four histone modification ChIP-seq read data sets: + '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', + '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming + Bowtie's executables are under '/sw/bowtie/': + + rsem-calculate-expression --star \ + --star-path /sw/STAR \ + --gzipped-read-file \ + --paired-end \ + --calc-pme \ + --run-pRSEM \ + --partition-model cmb_lgt \ + --chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \ + --bowtie-path /sw/bowtie \ + -p 8 \ + /data/mmliver_1.fq.gz \ + /data/mmliver_2.fq.gz \ + /ref/mouse_125 \ + mmliver_paired_end_quals + diff --git a/src/rsem/rsem_calculate_expression/script.sh b/src/rsem/rsem_calculate_expression/script.sh new file mode 100644 index 00000000..e8c6ce5d --- /dev/null +++ b/src/rsem/rsem_calculate_expression/script.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +set -eo pipefail + +function clean_up { + rm -rf "$tmpdir" +} +trap clean_up EXIT + +tmpdir=$(mktemp -d "$meta_temp_dir/$meta_functionality_name-XXXXXXXX") + +if [ "$par_strandedness" == 'forward' ]; then + strandedness='--strandedness forward' +elif [ "$par_strandedness" == 'reverse' ]; then + strandedness="--strandedness reverse" +else + strandedness='' +fi + +IFS=";" read -ra input <<< $par_input + +INDEX=$(find -L $meta_resources_dir/$par_index -name "*.grp" | sed 's/\.grp$//') + +unset_if_false=( par_paired par_quiet par_no_bam_output par_sampling_for_bam par_no_qualities + par_alignments par_bowtie2 par_star par_hisat2_hca par_append_names + par_single_cell_prior par_calc_pme par_calc_ci par_phred64_quals + par_solexa_quals par_star_gzipped_read_file par_star_bzipped_read_file + par_star_output_genome_bam par_estimate_rspd par_keep_intermediate_files + par_time par_run_pRSEM par_cap_stacked_chipseq_reads par_sort_bam_by_read_name ) + +for par in ${unset_if_false[@]}; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset $par +done + +rsem-calculate-expression \ + ${par_quiet:+-q} \ + ${par_no_bam_output:+--no-bam-output} \ + ${par_sampling_for_bam:+--sampling-for-bam} \ + ${par_no_qualities:+--no-qualities} \ + ${par_alignments:+--alignments} \ + ${par_bowtie2:+--bowtie2} \ + ${par_star:+--star} \ + ${par_hisat2_hca:+--hisat2-hca} \ + ${par_append_names:+--append-names} \ + ${par_single_cell_prior:+--single-cell-prior} \ + ${par_calc_pme:+--calc-pme} \ + ${par_calc_ci:+--calc-ci} \ + ${par_phred64_quals:+--phred64-quals} \ + ${par_solexa_quals:+--solexa-quals} \ + ${par_star_gzipped_read_file:+--star-gzipped-read-file} \ + ${par_star_bzipped_read_file:+--star-bzipped-read-file} \ + ${par_star_output_genome_bam:+--star-output-genome-bam} \ + ${par_estimate_rspd:+--estimate-rspd} \ + ${par_keep_intermediate_files:+--keep-intermediate-files} \ + ${par_time:+--time} \ + ${par_run_pRSEM:+--run-pRSEM} \ + ${par_cap_stacked_chipseq_reads:+--cap-stacked-chipseq-reads} \ + ${par_sort_bam_by_read_name:+--sort-bam-by-read-name} \ + ${par_counts_gene:+--counts-gene "$par_counts_gene"} \ + ${par_counts_transcripts:+--counts-transcripts "$par_counts_transcripts"} \ + ${par_stat:+--stat "$par_stat"} \ + ${par_bam_star:+--bam-star "$par_bam_star"} \ + ${par_bam_genome:+--bam-genome "$par_bam_genome"} \ + ${par_bam_transcript:+--bam-transcript "$par_bam_transcript"} \ + ${par_fai:+--fai "$par_fai"} \ + ${par_seed:+--seed "$par_seed"} \ + ${par_seed_length:+--seed-length "$par_seed_length"} \ + ${par_bowtie_n:+--bowtie-n "$par_bowtie_n"} \ + ${par_bowtie_e:+--bowtie-e "$par_bowtie_e"} \ + ${par_bowtie_m:+--bowtie-m "$par_bowtie_m"} \ + ${par_bowtie_chunkmbs:+--bowtie-chunkmbs "$par_bowtie_chunkmbs"} \ + ${par_bowtie2_mismatch_rate:+--bowtie2-mismatch-rate "$par_bowtie2_mismatch_rate"} \ + ${par_bowtie2_k:+--bowtie2-k "$par_bowtie2_k"} \ + ${par_bowtie2_sensitivity_level:+--bowtie2-sensitivity-level "$par_bowtie2_sensitivity_level"} \ + ${par_tag:+--tag "$par_tag"} \ + ${par_fragment_length_min:+--fragment-length-min "$par_fragment_length_min"} \ + ${par_fragment_length_max:+--fragment-length-max "$par_fragment_length_max"} \ + ${par_fragment_length_mean:+--fragment-length-mean "$par_fragment_length_mean"} \ + ${par_fragment_length_sd:+--fragment-length-sd "$par_fragment_length_sd"} \ + ${par_num_rspd_bins:+--num-rspd-bins "$par_num_rspd_bins"} \ + ${par_gibbs_burnin:+--gibbs-burnin "$par_gibbs_burnin"} \ + ${par_gibbs_number_of_samples:+--gibbs-number-of-samples "$par_gibbs_number_of_samples"} \ + ${par_gibbs_sampling_gap:+--gibbs-sampling-gap "$par_gibbs_sampling_gap"} \ + ${par_ci_credibility_level:+--ci-credibility-level "$par_ci_credibility_level"} \ + ${par_ci_number_of_samples_per_count_vector:+--ci-number-of-samples-per-count-vector "$par_ci_number_of_samples_per_count_vector"} \ + ${par_temporary_folder:+--temporary-folder "$par_temporary_folder"} \ + ${par_chipseq_peak_file:+--chipseq-peak-file "$par_chipseq_peak_file"} \ + ${par_chipseq_target_read_files:+--chipseq-target-read-files "$par_chipseq_target_read_files"} \ + ${par_chipseq_control_read_files:+--chipseq-control-read-files "$par_chipseq_control_read_files"} \ + ${par_chipseq_read_files_multi_targets:+--chipseq-read-files-multi-targets "$par_chipseq_read_files_multi_targets"} \ + ${par_chipseq_bed_files_multi_targets:+--chipseq-bed-files-multi-targets "$par_chipseq_bed_files_multi_targets"} \ + ${par_n_max_stacked_chipseq_reads:+--n-max-stacked-chipseq-reads "$par_n_max_stacked_chipseq_reads"} \ + ${par_partition_model:+--partition-model "$par_partition_model"} \ + $strandedness \ + ${par_paired:+--paired-end} \ + ${input[*]} \ + $INDEX \ + $par_id + diff --git a/src/rsem/rsem_calculate_expression/test.sh b/src/rsem/rsem_calculate_expression/test.sh new file mode 100644 index 00000000..c9ede884 --- /dev/null +++ b/src/rsem/rsem_calculate_expression/test.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +echo ">>> Testing $meta_executable" + +test_dir="${meta_resources_dir}/test_data" + +# wget https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq3/reference/rsem.tar.gz +# gunzip -k rsem.tar.gz +# tar -xf rsem.tar +# mv $test_dir/rsem $meta_resources_dir + +echo "> Prepare test data" + +cat > reads_R1.fastq <<'EOF' +@SEQ_ID1 +ACGCTGCCTCATAAGCCTCACACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +ACCCGCAAGATTAGGCTCCGTACAC ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > reads_R2.fastq <<'EOF' +@SEQ_ID1 +ATGTGTGAGGCTTATGAGGCAGCGT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +GTGTACGGAGCCTAATCTTGCAGGG ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > genome.fasta <<'EOF' +>chr1 +TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG +GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA +TGCGAATGGCACTTCACGACGGACTGTCCTTAGGTGTGAGGCTTATGAGGCACTCAGGGGA +EOF + +cat > genes.gtf <<'EOF' +chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source exon 20 40 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source gene 100 219 . + . gene_id "gene2"; transcript_id "transcript2"; +chr1 example_source exon 191 210 . + . gene_id "gene2"; transcript_id "transcript2"; +EOF + +cat > ref.cnt <<'EOF' +1 0 0 1 +0 0 0 +0 3 +0 1 +Inf 0 +EOF + +cat > ref.genes.results <<'EOF' +gene_id transcript_id(s) length effective_length expected_count TPM FPKM +gene1 transcript1 21.00 21.00 0.00 0.00 0.00 +gene2 transcript2 20.00 20.00 0.00 0.00 0.00 +EOF + +cat > ref.isoforms.results <<'EOF' +transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct +transcript1 gene1 21 21.00 0.00 0.00 0.00 0.00 +transcript2 gene2 20 20.00 0.00 0.00 0.00 0.00 +EOF + + +echo "> Generate index" + +rsem-prepare-reference \ + --gtf "genes.gtf" \ + "genome.fasta" \ + "index" + +mkdir index +mv index.* index/ + +STAR \ + ${meta_cpus:+--runThreadN $meta_cpus} \ + --runMode genomeGenerate \ + --genomeDir "index/" \ + --genomeFastaFiles "genome.fasta" \ + --sjdbGTFfile "genes.gtf" \ + --genomeSAindexNbases 2 + +######################################################################################### + +echo ">>> Test 1: Paired-end reads using STAR to align reads" +"$meta_executable" \ + --star \ + --paired \ + --input "reads_R1.fastq;reads_R2.fastq" \ + --index index \ + --id test \ + --seed 1 \ + --quiet + +echo ">>> Checking whether output exists" +[ ! -f "test.genes.results" ] && echo "Gene level expression counts file does not exist!" && exit 1 +[ ! -s "test.genes.results" ] && echo "Gene level expression counts file is empty!" && exit 1 +[ ! -f "test.isoforms.results" ] && echo "Transcript level expression counts file does not exist!" && exit 1 +[ ! -s "test.isoforms.results" ] && echo "Transcript level expression counts file is empty!" && exit 1 +[ ! -d "test.stat" ] && echo "Stats file does not exist!" && exit 1 + +echo ">>> Check wheter output is correct" +diff ref.genes.results test.genes.results || { echo "Gene level expression counts file is incorrect!"; exit 1; } +diff ref.isoforms.results test.isoforms.results || { echo "Transcript level expression counts file is incorrect!"; exit 1; } +diff ref.cnt test.stat/test.cnt || { echo "Stats file is incorrect!"; exit 1; } + +##################################################################################################### + +echo "All tests succeeded!" +exit 0