From 8acef309bf08b3c99c720b54c2ebdda433bdc710 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 27 Feb 2024 14:29:21 +0100 Subject: [PATCH] Add star_align_reads component (#22) * add star align component * refactor script * refactor variables and script * ensure zcat and bzcat are also installed * change utf8 into ascii * clean up quotes * fix renamed argument * better utf8 to ascii conversion * update format to viash 0.9 * add star version to docker * add TODOs * add newline * wip check contents of star output * remove test data * rename component * remove todo * remove starsolo params * Apply suggestions from code review Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> * add missing slash --------- Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- CHANGELOG.md | 2 + .../star_align_reads/argument_groups.yaml | 1088 +++++++++++++++++ src/star/star_align_reads/config.vsh.yaml | 115 ++ src/star/star_align_reads/help.txt | 927 ++++++++++++++ src/star/star_align_reads/script.py | 109 ++ src/star/star_align_reads/test.sh | 173 +++ .../star_align_reads/utils/process_params.R | 189 +++ 7 files changed, 2603 insertions(+) create mode 100644 src/star/star_align_reads/argument_groups.yaml create mode 100644 src/star/star_align_reads/config.vsh.yaml create mode 100644 src/star/star_align_reads/help.txt create mode 100644 src/star/star_align_reads/script.py create mode 100644 src/star/star_align_reads/test.sh create mode 100644 src/star/star_align_reads/utils/process_params.R diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f6fd740..bd6a639a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ * `lofreq/indelqual`: Insert indel qualities into BAM file (PR #17). +* `star/star_align_reads`: Align reads to a reference genome (PR #22). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/star/star_align_reads/argument_groups.yaml b/src/star/star_align_reads/argument_groups.yaml new file mode 100644 index 00000000..e6a1c874 --- /dev/null +++ b/src/star/star_align_reads/argument_groups.yaml @@ -0,0 +1,1088 @@ +argument_groups: +- name: Run Parameters + arguments: + - name: --runRNGseed + type: integer + description: random number generator seed. + example: 777 +- name: Genome Parameters + arguments: + - name: --genomeDir + type: file + description: path to the directory where genome files are stored (for --runMode + alignReads) or will be generated (for --runMode generateGenome) + example: ./GenomeDir/ + required: yes + - name: --genomeLoad + type: string + description: |- + mode of shared memory usage for the genome files. Only used with --runMode alignReads. + + - LoadAndKeep ... load genome into shared and keep it in memory after run + - LoadAndRemove ... load genome into shared but remove it after run + - LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + - Remove ... do not map anything, just remove loaded genome from memory + - NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + example: NoSharedMemory + - name: --genomeFastaFiles + type: file + description: |- + path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + multiple: yes + multiple_sep: ; + - name: --genomeFileSizes + type: integer + description: genome files exact sizes in bytes. Typically, this should not be + defined by the user. + example: 0 + multiple: yes + multiple_sep: ; + - name: --genomeTransformOutput + type: string + description: |- + which output to transform back to original genome + + - SAM ... SAM/BAM alignments + - SJ ... splice junctions (SJ.out.tab) + - Quant ... quantifications (from --quantMode option) + - None ... no transformation of the output + multiple: yes + multiple_sep: ; + - name: --genomeChrSetMitochondrial + type: string + description: names of the mitochondrial chromosomes. Presently only used for STARsolo + statistics output/ + example: + - chrM + - M + - MT + multiple: yes + multiple_sep: ; +- name: Splice Junctions Database + arguments: + - name: --sjdbFileChrStartEnd + type: string + description: path to the files with genomic coordinates (chr start + end strand) for the splice junction introns. Multiple files can be supplied + and will be concatenated. + multiple: yes + multiple_sep: ; + - name: --sjdbGTFfile + type: file + description: path to the GTF file with annotations + - name: --sjdbGTFchrPrefix + type: string + description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL + annotations with UCSC genomes) + - name: --sjdbGTFfeatureExon + type: string + description: feature type in GTF file to be used as exons for building transcripts + example: exon + - name: --sjdbGTFtagExonParentTranscript + type: string + description: GTF attribute name for parent transcript ID (default "transcript_id" + works for GTF files) + example: transcript_id + - name: --sjdbGTFtagExonParentGene + type: string + description: GTF attribute name for parent gene ID (default "gene_id" works for + GTF files) + example: gene_id + - name: --sjdbGTFtagExonParentGeneName + type: string + description: GTF attribute name for parent gene name + example: gene_name + multiple: yes + multiple_sep: ; + - name: --sjdbGTFtagExonParentGeneType + type: string + description: GTF attribute name for parent gene type + example: + - gene_type + - gene_biotype + multiple: yes + multiple_sep: ; + - name: --sjdbOverhang + type: integer + description: length of the donor/acceptor sequence on each side of the junctions, + ideally = (mate_length - 1) + example: 100 + - name: --sjdbScore + type: integer + description: extra alignment score for alignments that cross database junctions + example: 2 + - name: --sjdbInsertSave + type: string + description: |- + which files to save when sjdb junctions are inserted on the fly at the mapping step + + - Basic ... only small junction / transcript files + - All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + example: Basic +- name: Variation parameters + arguments: + - name: --varVCFfile + type: string + description: path to the VCF file that contains variation data. The 10th column + should contain the genotype information, e.g. 0/1 +- name: Read Parameters + arguments: + - name: --readFilesType + type: string + description: |- + format of input read files + + - Fastx ... FASTA or FASTQ + - SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + - SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + example: Fastx + - name: --readFilesSAMattrKeep + type: string + description: |- + for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + + - All ... keep all tags + - None ... do not keep any tags + example: All + multiple: yes + multiple_sep: ; + - name: --readFilesManifest + type: file + description: |- + path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + - name: --readFilesPrefix + type: string + description: prefix for the read files names, i.e. it will be added in front of + the strings in --readFilesIn + - name: --readFilesCommand + type: string + description: |- + command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + multiple: yes + multiple_sep: ; + - name: --readMapNumber + type: integer + description: |- + number of reads to map from the beginning of the file + + -1: map all reads + example: -1 + - name: --readMatesLengthsIn + type: string + description: Equal/NotEqual - lengths of names,sequences,qualities for both mates + are the same / not the same. NotEqual is safe in all situations. + example: NotEqual + - name: --readNameSeparator + type: string + description: character(s) separating the part of the read names that will be trimmed + in output (read name after space is always trimmed) + example: / + multiple: yes + multiple_sep: ; + - name: --readQualityScoreBase + type: integer + description: number to be subtracted from the ASCII code to get Phred quality + score + example: 33 +- name: Read Clipping + arguments: + - name: --clipAdapterType + type: string + description: |- + adapter clipping type + + - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal + - None ... no adapter clipping, all other clip* parameters are disregarded + example: Hamming + - name: --clip3pNbases + type: integer + description: number(s) of bases to clip from 3p of each mate. If one value is + given, it will be assumed the same for both mates. + example: 0 + multiple: yes + multiple_sep: ; + - name: --clip3pAdapterSeq + type: string + description: |- + adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + + - polyA ... polyA sequence with the length equal to read length + multiple: yes + multiple_sep: ; + - name: --clip3pAdapterMMp + type: double + description: max proportion of mismatches for 3p adapter clipping for each mate. If + one value is given, it will be assumed the same for both mates. + example: 0.1 + multiple: yes + multiple_sep: ; + - name: --clip3pAfterAdapterNbases + type: integer + description: number of bases to clip from 3p of each mate after the adapter clipping. + If one value is given, it will be assumed the same for both mates. + example: 0 + multiple: yes + multiple_sep: ; + - name: --clip5pNbases + type: integer + description: number(s) of bases to clip from 5p of each mate. If one value is + given, it will be assumed the same for both mates. + example: 0 + multiple: yes + multiple_sep: ; +- name: Limits + arguments: + - name: --limitGenomeGenerateRAM + type: long + description: maximum available RAM (bytes) for genome generation + example: '31000000000' + - name: --limitIObufferSize + type: long + description: max available buffers size (bytes) for input/output, per thread + example: + - 30000000 + - 50000000 + multiple: yes + multiple_sep: ; + - name: --limitOutSAMoneReadBytes + type: long + description: 'max size of the SAM record (bytes) for one read. Recommended value: + >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax' + example: 100000 + - name: --limitOutSJoneRead + type: integer + description: max number of junctions for one read (including all multi-mappers) + example: 1000 + - name: --limitOutSJcollapsed + type: integer + description: max number of collapsed junctions + example: 1000000 + - name: --limitBAMsortRAM + type: long + description: maximum available RAM (bytes) for sorting BAM. If =0, it will be + set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory + option. + example: 0 + - name: --limitSjdbInsertNsj + type: integer + description: maximum number of junctions to be inserted to the genome on the fly + at the mapping stage, including those from annotations and those detected in + the 1st step of the 2-pass run + example: 1000000 + - name: --limitNreadsSoft + type: integer + description: soft limit on the number of reads + example: -1 +- name: 'Output: general' + arguments: + - name: --outTmpKeep + type: string + description: |- + whether to keep the temporary files after STAR runs is finished + + - None ... remove all temporary files + - All ... keep all files + - name: --outStd + type: string + description: |- + which output will be directed to stdout (standard out) + + - Log ... log messages + - SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + - BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + - BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + example: Log + - name: --outReadsUnmapped + type: string + description: |- + output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + + - None ... no output + - Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + - name: --outQSconversionAdd + type: integer + description: add this number to the quality score (e.g. to convert from Illumina + to Sanger, use -31) + example: 0 + - name: --outMultimapperOrder + type: string + description: |- + order of multimapping alignments in the output files + + - Old_2.4 ... quasi-random order used before 2.5.0 + - Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + example: Old_2.4 +- name: 'Output: SAM and BAM' + arguments: + - name: --outSAMtype + type: string + description: |- + type of SAM/BAM output + + 1st word: + - BAM ... output BAM without sorting + - SAM ... output SAM without sorting + - None ... no SAM/BAM output + 2nd, 3rd: + - Unsorted ... standard unsorted + - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + example: SAM + multiple: yes + multiple_sep: ; + - name: --outSAMmode + type: string + description: |- + mode of SAM output + + - None ... no SAM output + - Full ... full SAM output + - NoQS ... full SAM but without quality scores + example: Full + - name: --outSAMstrandField + type: string + description: |- + Cufflinks-like strand field flag + + - None ... not used + - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + - name: --outSAMattributes + type: string + description: |- + a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + + ***Presets: + - None ... no attributes + - Standard ... NH HI AS nM + - All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + - NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + - HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + - AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + - nM ... number of mismatches. For PE reads, sum over two mates. + - NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + - MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + - jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + - jI ... start and end of introns for all junctions (1-based). + - XS ... alignment strand according to --outSAMstrandField. + - MC ... mate's CIGAR string. Standard SAM tag. + - ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + - cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + - vA ... variant allele + - vG ... genomic coordinate of the variant overlapped by the read. + - vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + - ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + ***STARsolo: + - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + - GX GN ... gene ID and gene name for unique-gene reads. + - gx gn ... gene IDs and gene names for unique- and multi-gene reads. + - CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + - sM ... assessment of CB and UMI. + - sS ... sequence of the entire barcode (CB,UMI,adapter). + - sQ ... quality of the entire barcode. + - sF ... type of feature overlap and number of features for each alignment + ***Unsupported/undocumented: + - rB ... alignment block read/genomic coordinates. + - vR ... read coordinate of the variant. + example: Standard + multiple: yes + multiple_sep: ; + - name: --outSAMattrIHstart + type: integer + description: start value for the IH attribute. 0 may be required by some downstream + software, such as Cufflinks or StringTie. + example: 1 + - name: --outSAMunmapped + type: string + description: |- + output of unmapped reads in the SAM format + + 1st word: + - None ... no output + - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + multiple: yes + multiple_sep: ; + - name: --outSAMorder + type: string + description: |- + type of sorting for the SAM output + + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + example: Paired + - name: --outSAMprimaryFlag + type: string + description: |- + which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + + - OneBestScore ... only one alignment with the best score is primary + - AllBestScore ... all alignments with the best score are primary + example: OneBestScore + - name: --outSAMreadID + type: string + description: |- + read ID record type + + - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + - Number ... read number (index) in the FASTx file + example: Standard + - name: --outSAMmapqUnique + type: integer + description: '0 to 255: the MAPQ value for unique mappers' + example: 255 + - name: --outSAMflagOR + type: integer + description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e. + FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, + and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.' + example: 0 + - name: --outSAMflagAND + type: integer + description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e. + FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, + but before outSAMflagOR. Can be used to unset specific bits that are not set + otherwise.' + example: 65535 + - name: --outSAMattrRGline + type: string + description: |- + SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + multiple: yes + multiple_sep: ; + - name: --outSAMheaderHD + type: string + description: '@HD (header) line of the SAM header' + multiple: yes + multiple_sep: ; + - name: --outSAMheaderPG + type: string + description: extra @PG (software) line of the SAM header (in addition to STAR) + multiple: yes + multiple_sep: ; + - name: --outSAMheaderCommentFile + type: string + description: path to the file with @CO (comment) lines of the SAM header + - name: --outSAMfilter + type: string + description: |- + filter the output into main SAM/BAM files + + - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + - KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + multiple: yes + multiple_sep: ; + - name: --outSAMmultNmax + type: integer + description: |- + max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + + - -1 ... all alignments (up to --outFilterMultimapNmax) will be output + example: -1 + - name: --outSAMtlen + type: integer + description: |- + calculation method for the TLEN field in the SAM/BAM files + + - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + example: 1 + - name: --outBAMcompression + type: integer + description: -1 to 10 BAM compression level, -1=default compression (6?), 0=no + compression, 10=maximum compression + example: 1 + - name: --outBAMsortingThreadN + type: integer + description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).' + example: 0 + - name: --outBAMsortingBinsN + type: integer + description: '>0: number of genome bins for coordinate-sorting' + example: 50 +- name: BAM processing + arguments: + - name: --bamRemoveDuplicatesType + type: string + description: |- + mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + + - - ... no duplicate removal/marking + - UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + - UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + - name: --bamRemoveDuplicatesMate2basesN + type: integer + description: number of bases from the 5' of mate 2 to use in collapsing (e.g. + for RAMPAGE) + example: 0 +- name: Output Wiggle + arguments: + - name: --outWigType + type: string + description: |- + type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + + 1st word: + - None ... no signal output + - bedGraph ... bedGraph format + - wiggle ... wiggle format + 2nd word: + - read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + - read2 ... signal from only 2nd read + multiple: yes + multiple_sep: ; + - name: --outWigStrand + type: string + description: |- + strandedness of wiggle/bedGraph output + + - Stranded ... separate strands, str1 and str2 + - Unstranded ... collapsed strands + example: Stranded + - name: --outWigReferencesPrefix + type: string + description: prefix matching reference names to include in the output wiggle file, + e.g. "chr", default "-" - include all references + - name: --outWigNorm + type: string + description: |- + type of normalization for the signal + + - RPM ... reads per million of mapped reads + - None ... no normalization, "raw" counts + example: RPM +- name: Output Filtering + arguments: + - name: --outFilterType + type: string + description: |- + type of filtering + + - Normal ... standard filtering using only current alignment + - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + example: Normal + - name: --outFilterMultimapScoreRange + type: integer + description: the score range below the maximum score for multimapping alignments + example: 1 + - name: --outFilterMultimapNmax + type: integer + description: |- + maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + example: 10 + - name: --outFilterMismatchNmax + type: integer + description: alignment will be output only if it has no more mismatches than this + value. + example: 10 + - name: --outFilterMismatchNoverLmax + type: double + description: alignment will be output only if its ratio of mismatches to *mapped* + length is less than or equal to this value. + example: 0.3 + - name: --outFilterMismatchNoverReadLmax + type: double + description: alignment will be output only if its ratio of mismatches to *read* + length is less than or equal to this value. + example: 1.0 + - name: --outFilterScoreMin + type: integer + description: alignment will be output only if its score is higher than or equal + to this value. + example: 0 + - name: --outFilterScoreMinOverLread + type: double + description: same as outFilterScoreMin, but normalized to read length (sum of + mates' lengths for paired-end reads) + example: 0.66 + - name: --outFilterMatchNmin + type: integer + description: alignment will be output only if the number of matched bases is higher + than or equal to this value. + example: 0 + - name: --outFilterMatchNminOverLread + type: double + description: sam as outFilterMatchNmin, but normalized to the read length (sum + of mates' lengths for paired-end reads). + example: 0.66 + - name: --outFilterIntronMotifs + type: string + description: |- + filter alignment using their motifs + + - None ... no filtering + - RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + - RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + - name: --outFilterIntronStrands + type: string + description: |- + filter alignments + + - RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + - None ... no filtering + example: RemoveInconsistentStrands +- name: Output splice junctions (SJ.out.tab) + arguments: + - name: --outSJtype + type: string + description: |- + type of splice junction output + + - Standard ... standard SJ.out.tab output + - None ... no splice junction output + example: Standard +- name: 'Output Filtering: Splice Junctions' + arguments: + - name: --outSJfilterReads + type: string + description: |- + which reads to consider for collapsed splice junctions output + + - All ... all reads, unique- and multi-mappers + - Unique ... uniquely mapping reads only + example: All + - name: --outSJfilterOverhangMin + type: integer + description: |- + minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + does not apply to annotated junctions + example: + - 30 + - 12 + - 12 + - 12 + multiple: yes + multiple_sep: ; + - name: --outSJfilterCountUniqueMin + type: integer + description: |- + minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + multiple_sep: ; + - name: --outSJfilterCountTotalMin + type: integer + description: |- + minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + example: + - 3 + - 1 + - 1 + - 1 + multiple: yes + multiple_sep: ; + - name: --outSJfilterDistToOtherSJmin + type: integer + description: |- + minimum allowed distance to other junctions' donor/acceptor + + does not apply to annotated junctions + example: + - 10 + - 0 + - 5 + - 10 + multiple: yes + multiple_sep: ; + - name: --outSJfilterIntronMaxVsReadN + type: integer + description: |- + maximum gap allowed for junctions supported by 1,2,3,,,N reads + + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + example: + - 50000 + - 100000 + - 200000 + multiple: yes + multiple_sep: ; +- name: Scoring + arguments: + - name: --scoreGap + type: integer + description: splice junction penalty (independent on intron motif) + example: 0 + - name: --scoreGapNoncan + type: integer + description: non-canonical junction penalty (in addition to scoreGap) + example: -8 + - name: --scoreGapGCAG + type: integer + description: GC/AG and CT/GC junction penalty (in addition to scoreGap) + example: -4 + - name: --scoreGapATAC + type: integer + description: AT/AC and GT/AT junction penalty (in addition to scoreGap) + example: -8 + - name: --scoreGenomicLengthLog2scale + type: integer + description: 'extra score logarithmically scaled with genomic length of the alignment: + scoreGenomicLengthLog2scale*log2(genomicLength)' + example: 0 + - name: --scoreDelOpen + type: integer + description: deletion open penalty + example: -2 + - name: --scoreDelBase + type: integer + description: deletion extension penalty per base (in addition to scoreDelOpen) + example: -2 + - name: --scoreInsOpen + type: integer + description: insertion open penalty + example: -2 + - name: --scoreInsBase + type: integer + description: insertion extension penalty per base (in addition to scoreInsOpen) + example: -2 + - name: --scoreStitchSJshift + type: integer + description: maximum score reduction while searching for SJ boundaries in the + stitching step + example: 1 +- name: Alignments and Seeding + arguments: + - name: --seedSearchStartLmax + type: integer + description: defines the search start point through the read - the read is split + into pieces no longer than this value + example: 50 + - name: --seedSearchStartLmaxOverLread + type: double + description: seedSearchStartLmax normalized to read length (sum of mates' lengths + for paired-end reads) + example: 1.0 + - name: --seedSearchLmax + type: integer + description: defines the maximum length of the seeds, if =0 seed length is not + limited + example: 0 + - name: --seedMultimapNmax + type: integer + description: only pieces that map fewer than this value are utilized in the stitching + procedure + example: 10000 + - name: --seedPerReadNmax + type: integer + description: max number of seeds per read + example: 1000 + - name: --seedPerWindowNmax + type: integer + description: max number of seeds per window + example: 50 + - name: --seedNoneLociPerWindow + type: integer + description: max number of one seed loci per window + example: 10 + - name: --seedSplitMin + type: integer + description: min length of the seed sequences split by Ns or mate gap + example: 12 + - name: --seedMapMin + type: integer + description: min length of seeds to be mapped + example: 5 + - name: --alignIntronMin + type: integer + description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, + otherwise it is considered Deletion + example: 21 + - name: --alignIntronMax + type: integer + description: maximum intron size, if 0, max intron size will be determined by + (2^winBinNbits)*winAnchorDistNbins + example: 0 + - name: --alignMatesGapMax + type: integer + description: maximum gap between two mates, if 0, max intron gap will be determined + by (2^winBinNbits)*winAnchorDistNbins + example: 0 + - name: --alignSJoverhangMin + type: integer + description: minimum overhang (i.e. block size) for spliced alignments + example: 5 + - name: --alignSJstitchMismatchNmax + type: integer + description: |- + maximum number of mismatches for stitching of the splice junctions (-1: no limit). + + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + example: + - 0 + - -1 + - 0 + - 0 + multiple: yes + multiple_sep: ; + - name: --alignSJDBoverhangMin + type: integer + description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + example: 3 + - name: --alignSplicedMateMapLmin + type: integer + description: minimum mapped length for a read mate that is spliced + example: 0 + - name: --alignSplicedMateMapLminOverLmate + type: double + description: alignSplicedMateMapLmin normalized to mate length + example: 0.66 + - name: --alignWindowsPerReadNmax + type: integer + description: max number of windows per read + example: 10000 + - name: --alignTranscriptsPerWindowNmax + type: integer + description: max number of transcripts per window + example: 100 + - name: --alignTranscriptsPerReadNmax + type: integer + description: max number of different alignments per read to consider + example: 10000 + - name: --alignEndsType + type: string + description: |- + type of read ends alignment + + - Local ... standard local alignment with soft-clipping allowed + - EndToEnd ... force end-to-end read alignment, do not soft-clip + - Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + example: Local + - name: --alignEndsProtrude + type: string + description: |- + allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + - ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + - DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + example: 0 ConcordantPair + - name: --alignSoftClipAtReferenceEnds + type: string + description: |- + allow the soft-clipping of the alignments past the end of the chromosomes + + - Yes ... allow + - No ... prohibit, useful for compatibility with Cufflinks + example: 'Yes' + - name: --alignInsertionFlush + type: string + description: |- + how to flush ambiguous insertion positions + + - None ... insertions are not flushed + - Right ... insertions are flushed to the right +- name: Paired-End reads + arguments: + - name: --peOverlapNbasesMin + type: integer + description: minimum number of overlapping bases to trigger mates merging and + realignment. Specify >0 value to switch on the "merginf of overlapping mates" + algorithm. + example: 0 + - name: --peOverlapMMp + type: double + description: maximum proportion of mismatched bases in the overlap area + example: 0.01 +- name: Windows, Anchors, Binning + arguments: + - name: --winAnchorMultimapNmax + type: integer + description: max number of loci anchors are allowed to map to + example: 50 + - name: --winBinNbits + type: integer + description: =log2(winBin), where winBin is the size of the bin for the windows/clustering, + each window will occupy an integer number of bins. + example: 16 + - name: --winAnchorDistNbins + type: integer + description: max number of bins between two anchors that allows aggregation of + anchors into one window + example: 9 + - name: --winFlankNbins + type: integer + description: log2(winFlank), where win Flank is the size of the left and right + flanking regions for each window + example: 4 + - name: --winReadCoverageRelativeMin + type: double + description: minimum relative coverage of the read sequence by the seeds in a + window, for STARlong algorithm only. + example: 0.5 + - name: --winReadCoverageBasesMin + type: integer + description: minimum number of bases covered by the seeds in a window , for STARlong + algorithm only. + example: 0 +- name: Chimeric Alignments + arguments: + - name: --chimOutType + type: string + description: |- + type of chimeric output + + - Junctions ... Chimeric.out.junction + - SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + - WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + - WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + - WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments + example: Junctions + multiple: yes + multiple_sep: ; + - name: --chimSegmentMin + type: integer + description: minimum length of chimeric segment length, if ==0, no chimeric output + example: 0 + - name: --chimScoreMin + type: integer + description: minimum total (summed) score of the chimeric segments + example: 0 + - name: --chimScoreDropMax + type: integer + description: max drop (difference) of chimeric score (the sum of scores of all + chimeric segments) from the read length + example: 20 + - name: --chimScoreSeparation + type: integer + description: minimum difference (separation) between the best chimeric score and + the next one + example: 10 + - name: --chimScoreJunctionNonGTAG + type: integer + description: penalty for a non-GT/AG chimeric junction + example: -1 + - name: --chimJunctionOverhangMin + type: integer + description: minimum overhang for a chimeric junction + example: 20 + - name: --chimSegmentReadGapMax + type: integer + description: maximum gap in the read sequence between chimeric segments + example: 0 + - name: --chimFilter + type: string + description: |- + different filters for chimeric alignments + + - None ... no filtering + - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + example: banGenomicN + multiple: yes + multiple_sep: ; + - name: --chimMainSegmentMultNmax + type: integer + description: maximum number of multi-alignments for the main chimeric segment. + =1 will prohibit multimapping main segments. + example: 10 + - name: --chimMultimapNmax + type: integer + description: |- + maximum number of chimeric multi-alignments + + - 0 ... use the old scheme for chimeric detection which only considered unique alignments + example: 0 + - name: --chimMultimapScoreRange + type: integer + description: the score range for multi-mapping chimeras below the best chimeric + score. Only works with --chimMultimapNmax > 1 + example: 1 + - name: --chimNonchimScoreDropMin + type: integer + description: to trigger chimeric detection, the drop in the best non-chimeric + alignment score with respect to the read length has to be greater than this + value + example: 20 + - name: --chimOutJunctionFormat + type: integer + description: |- + formatting type for the Chimeric.out.junction file + + - 0 ... no comment lines/headers + - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + example: 0 +- name: Quantification of Annotations + arguments: + - name: --quantMode + type: string + description: |- + types of quantification requested + + - - ... none + - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + - GeneCounts ... count reads per gene + multiple: yes + multiple_sep: ; + - name: --quantTranscriptomeBAMcompression + type: integer + description: |- + -2 to 10 transcriptome BAM compression level + + - -2 ... no BAM output + - -1 ... default compression (6?) + - 0 ... no compression + - 10 ... maximum compression + example: 1 + - name: --quantTranscriptomeSAMoutput + type: string + description: |- + alignment filtering for TranscriptomeSAM output + + - BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM + - BanSingleEnd ... prohibit single-end alignments, allow indels and softclips + - BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels + example: BanSingleEnd_BanIndels_ExtendSoftclip +- name: 2-pass Mapping + arguments: + - name: --twopassMode + type: string + description: |- + 2-pass mapping mode. + + - None ... 1-pass mapping + - Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + - name: --twopass1readsN + type: integer + description: number of reads to process for the 1st step. Use very large number + (or default -1) to map all reads in the first step. + example: -1 +- name: WASP parameters + arguments: + - name: --waspOutputMode + type: string + description: |- + WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 . + + - SAMtag ... add WASP tags to the alignments that pass WASP filtering diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml new file mode 100644 index 00000000..8fdd5256 --- /dev/null +++ b/src/star/star_align_reads/config.vsh.yaml @@ -0,0 +1,115 @@ +name: star_align_reads +namespace: star +description: | + Aligns reads to a reference genome using STAR. +keywords: [align, fasta, genome] +links: + repository: https://github.com/alexdobin/STAR + documentation: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf +references: + doi: 10.1093/bioinformatics/bts635 +license: MIT +requirements: + commands: [ STAR, python, ps, zcat, bzcat ] +# manually taking care of the main input and output arguments +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + alternatives: --readFilesIn + required: true + description: The single-end or paired-end R1 FastQ files to be processed. + example: [ mysample_S1_L001_R1_001.fastq.gz ] + multiple: true + - type: file + name: --input_r2 + required: false + description: The paired-end R2 FastQ files to be processed. Only required if --input is a paired-end R1 file. + example: [ mysample_S1_L001_R2_001.fastq.gz ] + multiple: true + - name: Outputs + arguments: + - type: file + name: --aligned_reads + required: true + description: The output file containing the aligned reads. + direction: output + example: aligned_reads.bam + - type: file + name: --reads_per_gene + required: false + description: The output file containing the number of reads per gene. + direction: output + example: reads_per_gene.tsv + - type: file + name: --unmapped + required: false + description: The output file containing the unmapped reads. + direction: output + example: unmapped.fastq + - type: file + name: --unmapped_r2 + required: false + description: The output file containing the unmapped R2 reads. + direction: output + example: unmapped_r2.fastq + - type: file + name: --chimeric_junctions + required: false + description: The output file containing the chimeric junctions. + direction: output + example: chimeric_junctions.tsv + - type: file + name: --log + required: false + description: The output file containing the log of the alignment process. + direction: output + example: log.txt + - type: file + name: --splice_junctions + required: false + description: The output file containing the splice junctions. + direction: output + example: splice_junctions.tsv +# other arguments are defined in a separate file +__merge__: argument_groups.yaml +resources: + - type: python_script + path: script.py +test_resources: + - type: bash_script + path: test.sh +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - gzip + - bzip2 + # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile + - type: docker + env: + - STAR_VERSION 2.7.11b + - PACKAGES gcc g++ make wget zlib1g-dev unzip xxd + run: | + apt-get update && \ + apt-get install -y --no-install-recommends ${PACKAGES} && \ + cd /tmp && \ + wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \ + unzip ${STAR_VERSION}.zip && \ + cd STAR-${STAR_VERSION}/source && \ + make STARstatic CXXFLAGS_SIMD=-std=c++11 && \ + cp STAR /usr/local/bin && \ + cd / && \ + rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \ + apt-get --purge autoremove -y ${PACKAGES} && \ + apt-get clean + - type: docker + run: | + STAR --version | sed 's#\(.*\)#star: "\1"#' > /var/software_versions.txt +runners: + - type: executable + - type: nextflow diff --git a/src/star/star_align_reads/help.txt b/src/star/star_align_reads/help.txt new file mode 100644 index 00000000..940f639d --- /dev/null +++ b/src/star/star_align_reads/help.txt @@ -0,0 +1,927 @@ +Usage: STAR [options]... --genomeDir /path/to/genome/index/ --readFilesIn R1.fq R2.fq +Spliced Transcripts Alignment to a Reference (c) Alexander Dobin, 2009-2022 + +STAR version=2.7.11b +STAR compilation time,server,dir=2024-02-11T19:36:26+00:00 :/tmp/STAR-2.7.11b/source +For more details see: + + +### versions +versionGenome 2.7.4a + string: earliest genome index version compatible with this STAR release. Please do not change this value! + +### Parameter Files +parametersFiles - + string: name of a user-defined parameters file, "-": none. Can only be defined on the command line. + +### System +sysShell - + string: path to the shell binary, preferably bash, e.g. /bin/bash. + - ... the default shell is executed, typically /bin/sh. This was reported to fail on some Ubuntu systems - then you need to specify path to bash. + +### Run Parameters +runMode alignReads + string: type of the run. + alignReads ... map reads + genomeGenerate ... generate genome files + inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options. + liftOver ... lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles. + soloCellFiltering ... STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix + +runThreadN 1 + int: number of threads to run STAR + +runDirPerm User_RWX + string: permissions for the directories created at the run-time. + User_RWX ... user-read/write/execute + All_RWX ... all-read/write/execute (same as chmod 777) + +runRNGseed 777 + int: random number generator seed. + + +### Genome Parameters +genomeDir ./GenomeDir/ + string: path to the directory where genome files are stored (for --runMode alignReads) or will be generated (for --runMode generateGenome) + +genomeLoad NoSharedMemory + string: mode of shared memory usage for the genome files. Only used with --runMode alignReads. + LoadAndKeep ... load genome into shared and keep it in memory after run + LoadAndRemove ... load genome into shared but remove it after run + LoadAndExit ... load genome into shared memory and exit, keeping the genome in memory for future runs + Remove ... do not map anything, just remove loaded genome from memory + NoSharedMemory ... do not use shared memory, each job will have its own private copy of the genome + +genomeFastaFiles - + string(s): path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped. + Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins). + +genomeChainFiles - + string: chain files for genomic liftover. Only used with --runMode liftOver . + +genomeFileSizes 0 + uint(s)>0: genome files exact sizes in bytes. Typically, this should not be defined by the user. + +genomeTransformOutput None + string(s): which output to transform back to original genome + SAM ... SAM/BAM alignments + SJ ... splice junctions (SJ.out.tab) + Quant ... quantifications (from --quantMode option) + None ... no transformation of the output + +genomeChrSetMitochondrial chrM M MT + string(s): names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/ + +### Genome Indexing Parameters - only used with --runMode genomeGenerate +genomeChrBinNbits 18 + int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]). + +genomeSAindexNbases 14 + int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1). + +genomeSAsparseD 1 + int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction + +genomeSuffixLengthMax -1 + int: maximum length of the suffixes, has to be longer than read length. -1 = infinite. + +genomeTransformType None + string: type of genome transformation + None ... no transformation + Haploid ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele) + Diploid ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome) + +genomeTransformVCF - + string: path to VCF file for genome transformation + + + +#####UnderDevelopment_begin : not supported - do not use +genomeType Full + string: type of genome to generate + Full ... full (normal) genome + Transcriptome ... genome consists of transcript sequences + SuperTransriptome ... genome consists of superTranscript sequences +#####UnderDevelopment_end + +# DEPRECATED: please use --genomeTransformVCF and --genomeTransformType options instead. +#genomeConsensusFile - +# string: VCF file with consensus SNPs (i.e. alternative allele is the major (AF>0.5) allele) +# DEPRECATED + + + +### Splice Junctions Database +sjdbFileChrStartEnd - + string(s): path to the files with genomic coordinates (chr start end strand) for the splice junction introns. Multiple files can be supplied and will be concatenated. + +sjdbGTFfile - + string: path to the GTF file with annotations + +sjdbGTFchrPrefix - + string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes) + +sjdbGTFfeatureExon exon + string: feature type in GTF file to be used as exons for building transcripts + +sjdbGTFtagExonParentTranscript transcript_id + string: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files) + +sjdbGTFtagExonParentGene gene_id + string: GTF attribute name for parent gene ID (default "gene_id" works for GTF files) + +sjdbGTFtagExonParentGeneName gene_name + string(s): GTF attribute name for parent gene name + +sjdbGTFtagExonParentGeneType gene_type gene_biotype + string(s): GTF attribute name for parent gene type + +sjdbOverhang 100 + int>0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1) + +sjdbScore 2 + int: extra alignment score for alignments that cross database junctions + +sjdbInsertSave Basic + string: which files to save when sjdb junctions are inserted on the fly at the mapping step + Basic ... only small junction / transcript files + All ... all files including big Genome, SA and SAindex - this will create a complete genome directory + +### Variation parameters +varVCFfile - + string: path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1 + +### Input Files +inputBAMfile - + string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM + +### Read Parameters +readFilesType Fastx + string: format of input read files + Fastx ... FASTA or FASTQ + SAM SE ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view + SAM PE ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view + +readFilesSAMattrKeep All + string(s): for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL + All ... keep all tags + None ... do not keep any tags + +readFilesIn Read1 Read2 + string(s): paths to files that contain input read1 (and, if needed, read2) + +readFilesManifest - + string: path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns: + paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line. + single-end reads: read1_file_name $tab$ - $tab$ read_group_line. + Spaces, but not tabs are allowed in file names. + If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it. + If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line. + +readFilesPrefix - + string: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn + +readFilesCommand - + string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout + For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc. + +readMapNumber -1 + int: number of reads to map from the beginning of the file + -1: map all reads + +readMatesLengthsIn NotEqual + string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same / not the same. NotEqual is safe in all situations. + +readNameSeparator / + string(s): character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed) + +readQualityScoreBase 33 + int>=0: number to be subtracted from the ASCII code to get Phred quality score + +### Read Clipping + +clipAdapterType Hamming + string: adapter clipping type + Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp + CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Šošić: https://github.com/Martinsos/opal + None ... no adapter clipping, all other clip* parameters are disregarded + +clip3pNbases 0 + int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAdapterSeq - + string(s): adapter sequences to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates. + polyA ... polyA sequence with the length equal to read length + +clip3pAdapterMMp 0.1 + double(s): max proportion of mismatches for 3p adapter clipping for each mate. If one value is given, it will be assumed the same for both mates. + +clip3pAfterAdapterNbases 0 + int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates. + +clip5pNbases 0 + int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates. + +#####UnderDevelopment_begin : not supported - do not use +clip5pAdapterSeq - + string(s): adapter sequences to clip from 5p of each mate, separated by space. + +clip5pAdapterMMp 0.1 + double(s): max proportion of mismatches for 5p adapter clipping for each mate, separated by space + +clip5pAfterAdapterNbases 0 + int(s): number of bases to clip from 5p of each mate after the adapter clipping, separated by space. +#####UnderDevelopment_end + +### Limits +limitGenomeGenerateRAM 31000000000 + int>0: maximum available RAM (bytes) for genome generation + +limitIObufferSize 30000000 50000000 + int(s)>0: max available buffers size (bytes) for input/output, per thread + +limitOutSAMoneReadBytes 100000 + int>0: max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax + +limitOutSJoneRead 1000 + int>0: max number of junctions for one read (including all multi-mappers) + +limitOutSJcollapsed 1000000 + int>0: max number of collapsed junctions + +limitBAMsortRAM 0 + int>=0: maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option. + +limitSjdbInsertNsj 1000000 + int>=0: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run + +limitNreadsSoft -1 + int: soft limit on the number of reads + +### Output: general +outFileNamePrefix ./ + string: output files name prefix (including full or relative path). Can only be defined on the command line. + +outTmpDir - + string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed! + - ... the temp directory will default to outFileNamePrefix_STARtmp + +outTmpKeep None + string: whether to keep the temporary files after STAR runs is finished + None ... remove all temporary files + All ... keep all files + +outStd Log + string: which output will be directed to stdout (standard out) + Log ... log messages + SAM ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out + BAM_Unsorted ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted + BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate + BAM_Quant ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM + +outReadsUnmapped None + string: output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s). + None ... no output + Fastx ... output in separate fasta/fastq files, Unmapped.out.mate1/2 + +outQSconversionAdd 0 + int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31) + +outMultimapperOrder Old_2.4 + string: order of multimapping alignments in the output files + Old_2.4 ... quasi-random order used before 2.5.0 + Random ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases. + +### Output: SAM and BAM +outSAMtype SAM + strings: type of SAM/BAM output + 1st word: + BAM ... output BAM without sorting + SAM ... output SAM without sorting + None ... no SAM/BAM output + 2nd, 3rd: + Unsorted ... standard unsorted + SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM. + +outSAMmode Full + string: mode of SAM output + None ... no SAM output + Full ... full SAM output + NoQS ... full SAM but without quality scores + +outSAMstrandField None + string: Cufflinks-like strand field flag + None ... not used + intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out. + +outSAMattributes Standard + string(s): a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order. + ***Presets: + None ... no attributes + Standard ... NH HI AS nM + All ... NH HI AS nM NM MD jM jI MC ch + ***Alignment: + NH ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag. + HI ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag. + AS ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag. + nM ... number of mismatches. For PE reads, sum over two mates. + NM ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag. + MD ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag. + jM ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value. + jI ... start and end of introns for all junctions (1-based). + XS ... alignment strand according to --outSAMstrandField. + MC ... mate's CIGAR string. Standard SAM tag. + ch ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output. + cN ... number of bases clipped from the read ends: 5' and 3' + ***Variation: + vA ... variant allele + vG ... genomic coordinate of the variant overlapped by the read. + vW ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag. + ha ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid . + ***STARsolo: + CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing. + GX GN ... gene ID and gene name for unique-gene reads. + gx gn ... gene IDs and gene names for unique- and multi-gene reads. + CB UB ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate. + sM ... assessment of CB and UMI. + sS ... sequence of the entire barcode (CB,UMI,adapter). + sQ ... quality of the entire barcode. + sF ... type of feature overlap and number of features for each alignment + ***Unsupported/undocumented: + rB ... alignment block read/genomic coordinates. + vR ... read coordinate of the variant. + +outSAMattrIHstart 1 + int>=0: start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie. + +outSAMunmapped None + string(s): output of unmapped reads in the SAM format + 1st word: + None ... no output + Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam) + 2nd word: + KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads. + +outSAMorder Paired + string: type of sorting for the SAM output + Paired: one mate after the other for all paired alignments + PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files + +outSAMprimaryFlag OneBestScore + string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG + OneBestScore ... only one alignment with the best score is primary + AllBestScore ... all alignments with the best score are primary + +outSAMreadID Standard + string: read ID record type + Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end + Number ... read number (index) in the FASTx file + +outSAMmapqUnique 255 + int: 0 to 255: the MAPQ value for unique mappers + +outSAMflagOR 0 + int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise. + +outSAMflagAND 65535 + int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise. + +outSAMattrRGline - + string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z". + xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted. + Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g. + --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy + +outSAMheaderHD - + strings: @HD (header) line of the SAM header + +outSAMheaderPG - + strings: extra @PG (software) line of the SAM header (in addition to STAR) + +outSAMheaderCommentFile - + string: path to the file with @CO (comment) lines of the SAM header + +outSAMfilter None + string(s): filter the output into main SAM/BAM files + KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + KeepAllAddedReferences ... keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage. + + +outSAMmultNmax -1 + int: max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first + -1 ... all alignments (up to --outFilterMultimapNmax) will be output + +outSAMtlen 1 + int: calculation method for the TLEN field in the SAM/BAM files + 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate + 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends + +outBAMcompression 1 + int: -1 to 10 BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression + +outBAMsortingThreadN 0 + int: >=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN). + +outBAMsortingBinsN 50 + int: >0: number of genome bins for coordinate-sorting + +### BAM processing +bamRemoveDuplicatesType - + string: mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only + - ... no duplicate removal/marking + UniqueIdentical ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical + UniqueIdenticalNotMulti ... mark duplicate unique mappers but not multimappers. + +bamRemoveDuplicatesMate2basesN 0 + int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE) + +### Output Wiggle +outWigType None + string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate . + 1st word: + None ... no signal output + bedGraph ... bedGraph format + wiggle ... wiggle format + 2nd word: + read1_5p ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc + read2 ... signal from only 2nd read + +outWigStrand Stranded + string: strandedness of wiggle/bedGraph output + Stranded ... separate strands, str1 and str2 + Unstranded ... collapsed strands + +outWigReferencesPrefix - + string: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references + +outWigNorm RPM + string: type of normalization for the signal + RPM ... reads per million of mapped reads + None ... no normalization, "raw" counts + +### Output Filtering +outFilterType Normal + string: type of filtering + Normal ... standard filtering using only current alignment + BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab + +outFilterMultimapScoreRange 1 + int: the score range below the maximum score for multimapping alignments + +outFilterMultimapNmax 10 + int: maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value. + Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out . + +outFilterMismatchNmax 10 + int: alignment will be output only if it has no more mismatches than this value. + +outFilterMismatchNoverLmax 0.3 + real: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value. + +outFilterMismatchNoverReadLmax 1.0 + real: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value. + + +outFilterScoreMin 0 + int: alignment will be output only if its score is higher than or equal to this value. + +outFilterScoreMinOverLread 0.66 + real: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads) + +outFilterMatchNmin 0 + int: alignment will be output only if the number of matched bases is higher than or equal to this value. + +outFilterMatchNminOverLread 0.66 + real: sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads). + +outFilterIntronMotifs None + string: filter alignment using their motifs + None ... no filtering + RemoveNoncanonical ... filter out alignments that contain non-canonical junctions + RemoveNoncanonicalUnannotated ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept. + +outFilterIntronStrands RemoveInconsistentStrands + string: filter alignments + RemoveInconsistentStrands ... remove alignments that have junctions with inconsistent strands + None ... no filtering + +### Output splice junctions (SJ.out.tab) +outSJtype Standard + string: type of splice junction output + Standard ... standard SJ.out.tab output + None ... no splice junction output + +### Output Filtering: Splice Junctions +outSJfilterReads All + string: which reads to consider for collapsed splice junctions output + All ... all reads, unique- and multi-mappers + Unique ... uniquely mapping reads only + +outSJfilterOverhangMin 30 12 12 12 + 4 integers: minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + does not apply to annotated junctions + +outSJfilterCountUniqueMin 3 1 1 1 + 4 integers: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterCountTotalMin 3 1 1 1 + 4 integers: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif + Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied + does not apply to annotated junctions + +outSJfilterDistToOtherSJmin 10 0 5 10 + 4 integers>=0: minimum allowed distance to other junctions' donor/acceptor + does not apply to annotated junctions + +outSJfilterIntronMaxVsReadN 50000 100000 200000 + N integers>=0: maximum gap allowed for junctions supported by 1,2,3,,,N reads + i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax + does not apply to annotated junctions + +### Scoring +scoreGap 0 + int: splice junction penalty (independent on intron motif) + +scoreGapNoncan -8 + int: non-canonical junction penalty (in addition to scoreGap) + +scoreGapGCAG -4 + int: GC/AG and CT/GC junction penalty (in addition to scoreGap) + +scoreGapATAC -8 + int: AT/AC and GT/AT junction penalty (in addition to scoreGap) + +scoreGenomicLengthLog2scale -0.25 + int: extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength) + +scoreDelOpen -2 + int: deletion open penalty + +scoreDelBase -2 + int: deletion extension penalty per base (in addition to scoreDelOpen) + +scoreInsOpen -2 + int: insertion open penalty + +scoreInsBase -2 + int: insertion extension penalty per base (in addition to scoreInsOpen) + +scoreStitchSJshift 1 + int: maximum score reduction while searching for SJ boundaries in the stitching step + + +### Alignments and Seeding + +seedSearchStartLmax 50 + int>0: defines the search start point through the read - the read is split into pieces no longer than this value + +seedSearchStartLmaxOverLread 1.0 + real: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads) + +seedSearchLmax 0 + int>=0: defines the maximum length of the seeds, if =0 seed length is not limited + +seedMultimapNmax 10000 + int>0: only pieces that map fewer than this value are utilized in the stitching procedure + +seedPerReadNmax 1000 + int>0: max number of seeds per read + +seedPerWindowNmax 50 + int>0: max number of seeds per window + +seedNoneLociPerWindow 10 + int>0: max number of one seed loci per window + +seedSplitMin 12 + int>0: min length of the seed sequences split by Ns or mate gap + +seedMapMin 5 + int>0: min length of seeds to be mapped + +alignIntronMin 21 + int: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion + +alignIntronMax 0 + int: maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignMatesGapMax 0 + int: maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins + +alignSJoverhangMin 5 + int>0: minimum overhang (i.e. block size) for spliced alignments + +alignSJstitchMismatchNmax 0 -1 0 0 + 4*int>=0: maximum number of mismatches for stitching of the splice junctions (-1: no limit). + (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. + +alignSJDBoverhangMin 3 + int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments + +alignSplicedMateMapLmin 0 + int>0: minimum mapped length for a read mate that is spliced + +alignSplicedMateMapLminOverLmate 0.66 + real>0: alignSplicedMateMapLmin normalized to mate length + +alignWindowsPerReadNmax 10000 + int>0: max number of windows per read + +alignTranscriptsPerWindowNmax 100 + int>0: max number of transcripts per window + +alignTranscriptsPerReadNmax 10000 + int>0: max number of different alignments per read to consider + +alignEndsType Local + string: type of read ends alignment + Local ... standard local alignment with soft-clipping allowed + EndToEnd ... force end-to-end read alignment, do not soft-clip + Extend5pOfRead1 ... fully extend only the 5p of the read1, all other ends: local alignment + Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment + +alignEndsProtrude 0 ConcordantPair + int, string: allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate + 1st word: int: maximum number of protrusion bases allowed + 2nd word: string: + ConcordantPair ... report alignments with non-zero protrusion as concordant pairs + DiscordantPair ... report alignments with non-zero protrusion as discordant pairs + +alignSoftClipAtReferenceEnds Yes + string: allow the soft-clipping of the alignments past the end of the chromosomes + Yes ... allow + No ... prohibit, useful for compatibility with Cufflinks + +alignInsertionFlush None + string: how to flush ambiguous insertion positions + None ... insertions are not flushed + Right ... insertions are flushed to the right + +### Paired-End reads +peOverlapNbasesMin 0 + int>=0: minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the "merginf of overlapping mates" algorithm. + +peOverlapMMp 0.01 + real, >=0 & <1: maximum proportion of mismatched bases in the overlap area + +### Windows, Anchors, Binning + +winAnchorMultimapNmax 50 + int>0: max number of loci anchors are allowed to map to + +winBinNbits 16 + int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins. + +winAnchorDistNbins 9 + int>0: max number of bins between two anchors that allows aggregation of anchors into one window + +winFlankNbins 4 + int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window + +winReadCoverageRelativeMin 0.5 + real>=0: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only. + +winReadCoverageBasesMin 0 + int>0: minimum number of bases covered by the seeds in a window , for STARlong algorithm only. + +### Chimeric Alignments +chimOutType Junctions + string(s): type of chimeric output + Junctions ... Chimeric.out.junction + SeparateSAMold ... output old SAM into separate Chimeric.out.sam file + WithinBAM ... output into main aligned BAM files (Aligned.*.bam) + WithinBAM HardClip ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present) + WithinBAM SoftClip ... soft-clipping in the CIGAR for supplemental chimeric alignments + +chimSegmentMin 0 + int>=0: minimum length of chimeric segment length, if ==0, no chimeric output + +chimScoreMin 0 + int>=0: minimum total (summed) score of the chimeric segments + +chimScoreDropMax 20 + int>=0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length + +chimScoreSeparation 10 + int>=0: minimum difference (separation) between the best chimeric score and the next one + +chimScoreJunctionNonGTAG -1 + int: penalty for a non-GT/AG chimeric junction + +chimJunctionOverhangMin 20 + int>=0: minimum overhang for a chimeric junction + +chimSegmentReadGapMax 0 + int>=0: maximum gap in the read sequence between chimeric segments + +chimFilter banGenomicN + string(s): different filters for chimeric alignments + None ... no filtering + banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction + +chimMainSegmentMultNmax 10 + int>=1: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments. + +chimMultimapNmax 0 + int>=0: maximum number of chimeric multi-alignments + 0 ... use the old scheme for chimeric detection which only considered unique alignments + +chimMultimapScoreRange 1 + int>=0: the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1 + +chimNonchimScoreDropMin 20 + int>=0: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value + +chimOutJunctionFormat 0 + int: formatting type for the Chimeric.out.junction file + 0 ... no comment lines/headers + 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping + +### Quantification of Annotations +quantMode - + string(s): types of quantification requested + - ... none + TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file + GeneCounts ... count reads per gene + +quantTranscriptomeBAMcompression 1 + int: -2 to 10 transcriptome BAM compression level + -2 ... no BAM output + -1 ... default compression (6?) + 0 ... no compression + 10 ... maximum compression + +quantTranscriptomeSAMoutput BanSingleEnd_BanIndels_ExtendSoftclip + string: alignment filtering for TranscriptomeSAM output + BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM + BanSingleEnd ... prohibit single-end alignments, allow indels and softclips + BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels + + +### 2-pass Mapping +twopassMode None + string: 2-pass mapping mode. + None ... 1-pass mapping + Basic ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly + +twopass1readsN -1 + int: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step. + + +### WASP parameters +waspOutputMode None + string: WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061–1063 (2015), https://www.nature.com/articles/nmeth.3582 . + SAMtag ... add WASP tags to the alignments that pass WASP filtering + +### STARsolo (single cell RNA-seq) parameters +soloType None + string(s): type of single-cell RNA-seq + CB_UMI_Simple ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium. + CB_UMI_Complex ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq). + CB_samTagOut ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate] + SmartSeq ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases) + +soloCBtype Sequence + string: cell barcode type + Sequence: cell barcode is a sequence (standard option) + String: cell barcode is an arbitrary string + +soloCBwhitelist - + string(s): file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file. + None ... no whitelist: all cell barcodes are allowed + +soloCBstart 1 + int>0: cell barcode start base + +soloCBlen 16 + int>0: cell barcode length + +soloUMIstart 17 + int>0: UMI start base + +soloUMIlen 10 + int>0: UMI length + +soloBarcodeReadLength 1 + int: length of the barcode read + 1 ... equal to sum of soloCBlen+soloUMIlen + 0 ... not defined, do not check + +soloBarcodeMate 0 + int: identifies which read mate contains the barcode (CB+UMI) sequence + 0 ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed + 1 ... barcode sequence is a part of mate 1 + 2 ... barcode sequence is a part of mate 2 + +soloCBposition - + strings(s): position of Cell Barcode(s) on the barcode read. + Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2. + Format for each barcode: startAnchor_startPosition_endAnchor_endPosition + start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end + start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base + String for different barcodes are separated by space. + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 0_0_2_-1 3_1_3_8 + +soloUMIposition - + string: position of the UMI on the barcode read, same as soloCBposition + Example: inDrop (Zilionis et al, Nat. Protocols, 2017): + --soloCBposition 3_9_3_14 + +soloAdapterSequence - + string: adapter sequence to anchor barcodes. Only one adapter sequence is allowed. + +soloAdapterMismatchesNmax 1 + int>0: maximum number of mismatches allowed in adapter sequence. + +soloCBmatchWLtype 1MM_multi + string: matching the Cell Barcodes to the WhiteList + Exact ... only exact matches allowed + 1MM ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match. + 1MM_multi ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches. + Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0 + 1MM_multi_pseudocounts ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes. + 1MM_multi_Nbase_pseudocounts ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0 + EditDist_2 ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. + +soloInputSAMattrBarcodeSeq - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR . + This parameter is required when running STARsolo with input from SAM. + +soloInputSAMattrBarcodeQual - + string(s): when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order). + For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY . + If this parameter is '-' (default), the quality 'H' will be assigned to all bases. + +soloStrand Forward + string: strandedness of the solo libraries: + Unstranded ... no strand information + Forward ... read strand same as the original RNA molecule + Reverse ... read strand opposite to the original RNA molecule + +soloFeatures Gene + string(s): genomic features for which the UMI counts per Cell Barcode are collected + Gene ... genes: reads match the gene transcript + SJ ... splice junctions: reported in SJ.out.tab + GeneFull ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns + GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons + GeneFull_Ex50pAS ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction. + +#####UnderDevelopment_begin : not supported - do not use + Transcript3p ... quantification of transcript for 3' protocols +#####UnderDevelopment_end + +soloMultiMappers Unique + string(s): counting method for reads mapping to multiple genes + Unique ... count only reads that map to unique genes + Uniform ... uniformly distribute multi-genic UMIs to all genes + Rescue ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM) + PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not. + EM ... multi-gene UMIs are distributed using Expectation Maximization algorithm + +soloUMIdedup 1MM_All + string(s): type of UMI deduplication (collapsing) algorithm + 1MM_All ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once). + 1MM_Directional_UMItools ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017). + 1MM_Directional ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs + Exact ... only exactly matching UMIs are collapsed. + NoDedup ... no deduplication of UMIs, count all reads. + 1MM_CR ... CellRanger2-4 algorithm for 1MM UMI collapsing. + +soloUMIfiltering - + string(s): type of UMI filtering (for reads uniquely mapping to genes) + - ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0). + MultiGeneUMI ... basic + remove lower-count UMIs that map to more than one gene. + MultiGeneUMI_All ... basic + remove all UMIs that map to more than one gene. + MultiGeneUMI_CR ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 . + Only works with --soloUMIdedup 1MM_CR + +soloOutFileNames Solo.out/ features.tsv barcodes.tsv matrix.mtx + string(s): file names for STARsolo output: + file_name_prefix gene_names barcode_sequences cell_feature_count_matrix + +soloCellFilter CellRanger2.2 3000 0.99 10 + string(s): cell filtering type and parameters + None ... do not output filtered cells + TopCells ... only report top cells by UMI count, followed by the exact number of cells + CellRanger2.2 ... simple filtering of CellRanger 2.2. + Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count + The harcoded values are from CellRanger: nExpectedCells=3000; maxPercentile=0.99; maxMinRatio=10 + EmptyDrops_CR ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y + Can be followed by 10 numeric parameters: nExpectedCells maxPercentile maxMinRatio indMin indMax umiMin umiMinFracMedian candMaxN FDR simN + The harcoded values are from CellRanger: 3000 0.99 10 45000 90000 500 0.01 20000 0.01 10000 + +soloOutFormatFeaturesGeneField3 "Gene Expression" + string(s): field 3 in the Gene features.tsv file. If "-", then no 3rd field is output. + +soloCellReadStats None + string: Output reads statistics for each CB + Standard ... standard output + +#####UnderDevelopment_begin : not supported - do not use +soloClusterCBfile - + string: file containing the cluster information for cell barcodes, two columns: CB cluster_index. Only used with --soloFeatures Transcript3p +#####UnderDevelopment_end diff --git a/src/star/star_align_reads/script.py b/src/star/star_align_reads/script.py new file mode 100644 index 00000000..2bde8798 --- /dev/null +++ b/src/star/star_align_reads/script.py @@ -0,0 +1,109 @@ +import tempfile +import subprocess +import shutil +from pathlib import Path + +## VIASH START +par = { + "input": [ + "src/star/star_align_reads/test_data/a_R1.1.fastq", + "src/star/star_align_reads/test_data/a_R1.2.fastq", + ], + "input_r2": [ + "src/star/star_align_reads/test_data/a_R2.1.fastq", + "src/star/star_align_reads/test_data/a_R2.2.fastq", + ], + "genomeDir": "src/star/star_align_reads/test_data/genome.fasta", + "aligned_reads": "aligned_reads.sam" +} +meta = { + "cpus": 8, + "temp_dir": "/tmp" +} +## VIASH END + +################################################## +# check and process SE / PE R1 input files +input_r1 = par["input"] +readFilesIn = ",".join(par["input"]) +par["input"] = None + +# check and process PE R2 input files +input_r2 = par["input_r2"] +if input_r2 is not None: + if len(input_r1) != len(input_r2): + raise ValueError("The number of R1 and R2 files do not match.") + readFilesIn = [readFilesIn, ",".join(par["input_r2"])] + par["input_r2"] = None + +# store readFilesIn +par["readFilesIn"] = readFilesIn + +################################################## + +# determine readFilesCommand +if input_r1[0].endswith(".gz"): + print(">> Input files are gzipped, setting readFilesCommand to zcat", flush=True) + par["readFilesCommand"] = "zcat" +elif input_r1[0].endswith(".bz2"): + print(">> Input files are bzipped, setting readFilesCommand to bzcat", flush=True) + par["readFilesCommand"] = "bzcat" + +################################################## +# store output paths +expected_outputs = { + "aligned_reads": ["Aligned.out.sam", "Aligned.out.bam"], + "reads_per_gene": "ReadsPerGene.out.tab", + "chimeric_junctions": "Chimeric.out.junction", + "log": "Log.final.out", + "splice_junctions": "SJ.out.tab", + "unmapped": "Unmapped.out.mate1", + "unmapped_r2": "Unmapped.out.mate2" +} +output_paths = {name: par[name] for name in expected_outputs.keys()} +for name in expected_outputs.keys(): + par[name] = None + +################################################## +# process other args +par["runMode"] = "alignReads" + +if "cpus" in meta and meta["cpus"]: + par["runThreadN"] = meta["cpus"] + +################################################## +# run STAR and move output to final destination +with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True) as temp_dir: + print(">> Constructing command", flush=True) + + # set output paths + temp_dir = Path(temp_dir) + par["outTmpDir"] = temp_dir / "tempdir" + out_dir = temp_dir / "out" + par["outFileNamePrefix"] = f"{out_dir}/" # star needs this slash + + # construct command + cmd_args = [ "STAR" ] + for name, value in par.items(): + if value is not None: + val_to_add = value if isinstance(value, list) else [value] + cmd_args.extend([f"--{name}"] + [str(x) for x in val_to_add]) + print("", flush=True) + + # run command + print(">> Running STAR with command:", flush=True) + print(f"+ {' '.join(cmd_args)}", end="\n\n", flush=True) + subprocess.run( + cmd_args, + check=True + ) + print(">> STAR finished successfully", end="\n\n", flush=True) + + # move output to final destination + print(">> Moving output to final destination", flush=True) + for name, paths in expected_outputs.items(): + for expected_path in [paths] if isinstance(paths, str) else paths: + expected_full_path = out_dir / expected_path + if output_paths[name] and expected_full_path.is_file(): + print(f">> Moving {expected_path} to {output_paths[name]}", flush=True) + shutil.move(expected_full_path, output_paths[name]) diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh new file mode 100644 index 00000000..374b9014 --- /dev/null +++ b/src/star/star_align_reads/test.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +set -e + +## VIASH START +meta_executable="target/docker/star/star_align_reads/star_align_reads" +meta_resources_dir="src/star/star_align_reads" +## VIASH END + +######################################################################################### + +# helper functions +assert_file_exists() { + [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) +} +assert_file_empty() { + [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) +} +assert_file_not_empty() { + [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) +} +assert_file_contains() { + grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains() { + grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} +assert_file_contains_regex() { + grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains_regex() { + grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} + +######################################################################################### +echo "> Prepare test data" + +cat > reads_R1.fastq <<'EOF' +@SEQ_ID1 +ACGCTGCCTCATAAGCCTCACACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +ACCCGCAAGATTAGGCTCCGTACAC ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > reads_R2.fastq <<'EOF' +@SEQ_ID1 +ATGTGTGAGGCTTATGAGGCAGCGT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +GTGTACGGAGCCTAATCTTGCAGGG ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > genome.fasta <<'EOF' +>chr1 +TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG +GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA +TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA +EOF + +cat > genes.gtf <<'EOF' +chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source exon 20 40 . + . gene_id "gene1"; transcript_id "transcript1"; +EOF + +echo "> Generate index" +STAR \ + ${meta_cpus:+--runThreadN $meta_cpus} \ + --runMode genomeGenerate \ + --genomeDir "index/" \ + --genomeFastaFiles "genome.fasta" \ + --sjdbGTFfile "genes.gtf" \ + --genomeSAindexNbases 2 + +######################################################################################### + +mkdir star_align_reads_se +cd star_align_reads_se + +echo "> Run star_align_reads on SE" +"$meta_executable" \ + --input "../reads_R1.fastq" \ + --genomeDir "../index/" \ + --aligned_reads "output.sam" \ + --log "log.txt" \ + --outReadsUnmapped "Fastx" \ + --unmapped "unmapped.sam" \ + --quantMode "TranscriptomeSAM;GeneCounts" \ + --reads_per_gene "reads_per_gene.tsv" \ + --outSJtype Standard \ + --splice_junctions "splice_junctions.tsv" \ + ${meta_cpus:+---cpus $meta_cpus} + +# TODO: Test data doesn't contain any chimeric reads yet +# --chimOutType "Junctions" \ +# --chimeric_junctions "chimeric_junctions.tsv" \ + +echo ">> Check if output exists" +assert_file_exists "output.sam" +assert_file_exists "log.txt" +assert_file_exists "reads_per_gene.tsv" +# assert_file_exists "chimeric_junctions.tsv" +assert_file_exists "splice_junctions.tsv" +assert_file_exists "unmapped.sam" + +echo ">> Check if output contents are not empty" +assert_file_not_empty "output.sam" +assert_file_not_empty "log.txt" +assert_file_not_empty "reads_per_gene.tsv" +# assert_file_not_empty "chimeric_junctions.tsv" +# assert_file_not_empty "splice_junctions.tsv" # TODO: test data doesn't contain any splice junctions yet +assert_file_not_empty "unmapped.sam" + +echo ">> Check if output contents are correct" +assert_file_contains "log.txt" "Number of input reads \\| 2" +assert_file_contains "log.txt" "Number of reads unmapped: too short \\| 1" +assert_file_contains "log.txt" "Uniquely mapped reads number \\| 1" +assert_file_contains "reads_per_gene.tsv" "gene1 1 1 0" +assert_file_contains "reads_per_gene.tsv" "N_unmapped 1 1 1" +assert_file_contains "output.sam" "SEQ_ID1 0 chr1 17 255 25M \\* 0 0 ACGCTGCCTCATAAGCCTCACACAT IIIIIIIIIIIIIIIIIIIIIIIII NH:i:1 HI:i:1 AS:i:24 nM:i:0" +assert_file_contains "unmapped.sam" "@SEQ_ID2 0:N:" +assert_file_contains "unmapped.sam" "ACCCGCAAGATTAGGCTCCGTACAC" + +cd .. + +######################################################################################### + +mkdir star_align_reads_pe_minimal +cd star_align_reads_pe_minimal + +echo ">> Run star_align_reads on PE" +"$meta_executable" \ + --input ../reads_R1.fastq \ + --input_r2 ../reads_R2.fastq \ + --genomeDir ../index/ \ + --aligned_reads output.bam \ + --log log.txt \ + --outReadsUnmapped Fastx \ + --unmapped unmapped_r1.bam \ + --unmapped_r2 unmapped_r2.bam \ + ${meta_cpus:+---cpus $meta_cpus} + +echo ">> Check if output exists" +assert_file_exists "output.bam" +assert_file_exists "log.txt" +assert_file_exists "unmapped_r1.bam" +assert_file_exists "unmapped_r2.bam" + +echo ">> Check if output contents are not empty" +assert_file_not_empty "output.bam" +assert_file_not_empty "log.txt" +assert_file_not_empty "unmapped_r1.bam" +assert_file_not_empty "unmapped_r2.bam" + +echo ">> Check if output contents are correct" +assert_file_contains "log.txt" "Number of input reads \\| 2" +assert_file_contains "log.txt" "Number of reads unmapped: too short \\| 1" +assert_file_contains "log.txt" "Uniquely mapped reads number \\| 1" + +cd .. + +######################################################################################### + +echo "> Test successful" diff --git a/src/star/star_align_reads/utils/process_params.R b/src/star/star_align_reads/utils/process_params.R new file mode 100644 index 00000000..ccdc50b3 --- /dev/null +++ b/src/star/star_align_reads/utils/process_params.R @@ -0,0 +1,189 @@ +library(tidyverse) + +# This script processes the STAR aligner's help file +# to create a viash argument_groups.yaml file. + +local_file <- "src/star/star_align_reads/help.txt" +yaml_file <- "src/star/star_align_reads/argument_groups.yaml" + +param_txt <- readr::read_lines(local_file) + +# replace non-ascii characters with their ascii approximations +param_txt <- iconv(param_txt, "UTF-8", "ASCII//TRANSLIT") + +dev_begin <- grep("#####UnderDevelopment_begin", param_txt) +dev_end <- grep("#####UnderDevelopment_end", param_txt) + +# strip development sections +nondev_ix <- unlist(map2(c(1, dev_end + 1), c(dev_begin - 1, length(param_txt)), function(i, j) { + if (i >= 1 && i < j) { + seq(i, j, 1) + } else { + NULL + } +})) + +param_txt2 <- param_txt[nondev_ix] + +# strip comments +param_txt3 <- param_txt2[-grep("^#[^#]", param_txt2)] + +# detect groups +group_ix <- grep("^### ", param_txt3) + +out <- map2_dfr( + group_ix, + c(group_ix[-1] - 1, length(param_txt3)), + function(group_start, group_end) { + # cat("group_start <- ", group_start, "; group_end <- ", group_end, "\n", sep = "") + group_name <- gsub("^### ", "", param_txt3[[group_start]]) + + group_txt <- param_txt3[seq(group_start + 1, group_end)] + + arg_ix <- grep("^[^ ]", group_txt) + + arguments <- map2_dfr( + arg_ix, + c(arg_ix[-1] - 1, length(group_txt)), + function(arg_start, arg_end) { + # cat("arg_start <- ", arg_start, "; arg_end <- ", arg_end, "\n", sep = "") + + # process name and default + first_txt <- group_txt[[arg_start]] + first_regex <- "^([^ ]*) +(.*) *$" + if (!grepl(first_regex, first_txt)) { + stop("Line '", first_txt, "' did not match regex '", first_regex, "'") + } + name <- gsub(first_regex, "\\1", first_txt) + default <- gsub(first_regex, "\\2", first_txt) + + # process type and first description + second_txt <- group_txt[[arg_start + 1]] + second_regex <- "^ +([^:]*):[ ]+(.*)$" + if (!grepl(second_regex, second_txt)) { + stop("Line '", second_txt, "' did not match regex '", second_regex, "'") + } + type <- gsub(second_regex, "\\1", second_txt) + desc_start <- str_trim(gsub(second_regex, "\\2", second_txt)) + + # process more description + desc_cont1 <- group_txt[seq(arg_start + 2, arg_end)] + + desc <- + if (sum(str_length(desc_cont1)) == 0) { + desc_start + } else { + # detect margin + margins <- str_extract(desc_cont1, "^( +)") %>% na.omit + margin <- margins[[which.min(str_length(margins))]] + desc_cont2 <- gsub(paste0("^", margin), "", desc_cont1) + desc_cont3 <- ifelse(grepl("\\.\\.\\.", desc_cont2), paste0("- ", desc_cont2), desc_cont2) + desc_cont4 <- str_trim(desc_cont3) + + # construct desc + str_trim(paste0(c(desc_start, "", desc_cont4), "\n", collapse = "")) + } + + tibble( + group_name, + name, + default, + type, + description = desc + ) + } + ) + + arguments + } +) + +# todo: manually fix alignEndsProtrude? +# assigning types +type_map <- c("string" = "string", "int" = "integer", "real" = "double", "double" = "double", "int, string" = "string") +file_args <- c("genomeDir", "readFilesIn", "sjdbGTFfile", "genomeFastaFiles", "genomeChainFiles", "readFilesManifest") +long_args <- c("limitGenomeGenerateRAM", "limitIObufferSize", "limitOutSAMoneReadBytes", "limitBAMsortRAM") +required_args <- c("genomeDir", "readFilesIn") + +# converting examples +as_safe_int <- function(x) tryCatch({as.integer(x)}, warning = function(e) { bit64::as.integer64(x) }) +safe_split <- function(x) strsplit(x, "'[^']*'(*SKIP)(*F)|\"[^\"]*\"(*SKIP)(*F)|\\s+", perl = TRUE)[[1]] %>% gsub("^[\"']|[\"']$", "", .) +trafos <- list( + string = function(x) x, + integer = as_safe_int, + double = as.numeric, + strings = function(x) safe_split(x), + integers = function(x) sapply(safe_split(x), as_safe_int), + doubles = function(x) as.numeric(safe_split(x)) +) +# remove arguments that are not relevant for viash +removed_args <- c("versionGenome", "parametersFiles", "sysShell", "runDirPerm") +# these settings are defined by the viash component +manual_args <- c("runThreadN", "outTmpDir", "runMode", "outFileNamePrefix", "readFilesIn") + +# make viash-like values +out2 <- out %>% + # remove arguments that are not relevant for viash + filter(!name %in% c(removed_args, manual_args)) %>% + # remove arguments that are related to a different runmode + filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) %>% + filter(!grepl("--runMode", group_name) | grepl("--runMode alignReads", group_name)) %>% + filter(!grepl("STARsolo", group_name)) %>% + mutate( + viash_arg = paste0("--", name), + type_step1 = type %>% + str_replace_all(".*(int, string|string|int|real|double)\\(?(s?).*", "\\1\\2"), + viash_type = type_map[gsub("(int, string|string|int|real|double).*", "\\1", type_step1)], + multiple = type_step1 == "int, string" | grepl("s$", type_step1) | grepl("^[4N][\\* ]", type), + default_step1 = default %>% + {ifelse(. %in% c("-", "None"), NA_character_, .)}, + viash_default = + mapply( + default_step1, + paste0(viash_type, ifelse(multiple, "s", "")), + FUN = function(str, typ) trafos[[typ]](str) + ), + # viash_type = ifelse(sapply(viash_default, bit64::is.integer64), "long", viash_type), + # update type + viash_type = case_when( + name %in% long_args ~ "long", + name %in% file_args ~ "file", + TRUE ~ viash_type + ), + # turn longs into character because yaml::write_yaml doesn't handle longs well + viash_default = ifelse(sapply(viash_default, bit64::is.integer64), map(viash_default, as.character), viash_default), + group_name = gsub(" - .*", "", group_name), + required = ifelse(name %in% required_args, TRUE, NA) + ) +print(out2, n = 200) +out2 %>% mutate(i = row_number()) %>% + # filter(is.na(default_step1) != is.na(viash_default)) %>% + select(-group_name, -description) + +out2 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) + +argument_groups <- map(unique(out2$group_name), function(group_name) { + args <- out2 %>% + filter(group_name == !!group_name) %>% + pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, ...) { + li <- lst( + name = viash_arg, + type = viash_type, + description = description + ) + if (all(!is.na(viash_default))) { + li$example <- viash_default + } + if (!is.na(multiple) && multiple) { + li$multiple <- multiple + li$multiple_sep <- ";" + } + if (!is.na(required) && required) { + li$required <- required + } + li + }) + list(name = group_name, arguments = args) +}) + +yaml::write_yaml(list(argument_groups = argument_groups), yaml_file)