From 8acef309bf08b3c99c720b54c2ebdda433bdc710 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Tue, 27 Feb 2024 14:29:21 +0100
Subject: [PATCH] Add star_align_reads component (#22)

* add star align component

* refactor script

* refactor variables and script

* ensure zcat and bzcat are also installed

* change utf8 into ascii

* clean up quotes

* fix renamed argument

* better utf8 to ascii conversion

* update format to viash 0.9

* add star version to docker

* add TODOs

* add newline

* wip check contents of star output

* remove test data

* rename component

* remove todo

* remove starsolo params

* Apply suggestions from code review

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>

* add missing slash

---------

Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com>
---
 CHANGELOG.md                                  |    2 +
 .../star_align_reads/argument_groups.yaml     | 1088 +++++++++++++++++
 src/star/star_align_reads/config.vsh.yaml     |  115 ++
 src/star/star_align_reads/help.txt            |  927 ++++++++++++++
 src/star/star_align_reads/script.py           |  109 ++
 src/star/star_align_reads/test.sh             |  173 +++
 .../star_align_reads/utils/process_params.R   |  189 +++
 7 files changed, 2603 insertions(+)
 create mode 100644 src/star/star_align_reads/argument_groups.yaml
 create mode 100644 src/star/star_align_reads/config.vsh.yaml
 create mode 100644 src/star/star_align_reads/help.txt
 create mode 100644 src/star/star_align_reads/script.py
 create mode 100644 src/star/star_align_reads/test.sh
 create mode 100644 src/star/star_align_reads/utils/process_params.R

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f6fd740..bd6a639a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,8 @@
 
 * `lofreq/indelqual`: Insert indel qualities into BAM file (PR #17).
 
+* `star/star_align_reads`: Align reads to a reference genome (PR #22).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/star/star_align_reads/argument_groups.yaml b/src/star/star_align_reads/argument_groups.yaml
new file mode 100644
index 00000000..e6a1c874
--- /dev/null
+++ b/src/star/star_align_reads/argument_groups.yaml
@@ -0,0 +1,1088 @@
+argument_groups:
+- name: Run Parameters
+  arguments:
+  - name: --runRNGseed
+    type: integer
+    description: random number generator seed.
+    example: 777
+- name: Genome Parameters
+  arguments:
+  - name: --genomeDir
+    type: file
+    description: path to the directory where genome files are stored (for --runMode
+      alignReads) or will be generated (for --runMode generateGenome)
+    example: ./GenomeDir/
+    required: yes
+  - name: --genomeLoad
+    type: string
+    description: |-
+      mode of shared memory usage for the genome files. Only used with --runMode alignReads.
+
+      - LoadAndKeep     ... load genome into shared and keep it in memory after run
+      - LoadAndRemove   ... load genome into shared but remove it after run
+      - LoadAndExit     ... load genome into shared memory and exit, keeping the genome in memory for future runs
+      - Remove          ... do not map anything, just remove loaded genome from memory
+      - NoSharedMemory  ... do not use shared memory, each job will have its own private copy of the genome
+    example: NoSharedMemory
+  - name: --genomeFastaFiles
+    type: file
+    description: |-
+      path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.
+
+      Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins).
+    multiple: yes
+    multiple_sep: ;
+  - name: --genomeFileSizes
+    type: integer
+    description: genome files exact sizes in bytes. Typically, this should not be
+      defined by the user.
+    example: 0
+    multiple: yes
+    multiple_sep: ;
+  - name: --genomeTransformOutput
+    type: string
+    description: |-
+      which output to transform back to original genome
+
+      - SAM     ... SAM/BAM alignments
+      - SJ      ... splice junctions (SJ.out.tab)
+      - Quant   ... quantifications (from --quantMode option)
+      - None    ... no transformation of the output
+    multiple: yes
+    multiple_sep: ;
+  - name: --genomeChrSetMitochondrial
+    type: string
+    description: names of the mitochondrial chromosomes. Presently only used for STARsolo
+      statistics output/
+    example:
+    - chrM
+    - M
+    - MT
+    multiple: yes
+    multiple_sep: ;
+- name: Splice Junctions Database
+  arguments:
+  - name: --sjdbFileChrStartEnd
+    type: string
+    description: path to the files with genomic coordinates (chr <tab> start <tab>
+      end <tab> strand) for the splice junction introns. Multiple files can be supplied
+      and will be concatenated.
+    multiple: yes
+    multiple_sep: ;
+  - name: --sjdbGTFfile
+    type: file
+    description: path to the GTF file with annotations
+  - name: --sjdbGTFchrPrefix
+    type: string
+    description: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL
+      annotations with UCSC genomes)
+  - name: --sjdbGTFfeatureExon
+    type: string
+    description: feature type in GTF file to be used as exons for building transcripts
+    example: exon
+  - name: --sjdbGTFtagExonParentTranscript
+    type: string
+    description: GTF attribute name for parent transcript ID (default "transcript_id"
+      works for GTF files)
+    example: transcript_id
+  - name: --sjdbGTFtagExonParentGene
+    type: string
+    description: GTF attribute name for parent gene ID (default "gene_id" works for
+      GTF files)
+    example: gene_id
+  - name: --sjdbGTFtagExonParentGeneName
+    type: string
+    description: GTF attribute name for parent gene name
+    example: gene_name
+    multiple: yes
+    multiple_sep: ;
+  - name: --sjdbGTFtagExonParentGeneType
+    type: string
+    description: GTF attribute name for parent gene type
+    example:
+    - gene_type
+    - gene_biotype
+    multiple: yes
+    multiple_sep: ;
+  - name: --sjdbOverhang
+    type: integer
+    description: length of the donor/acceptor sequence on each side of the junctions,
+      ideally = (mate_length - 1)
+    example: 100
+  - name: --sjdbScore
+    type: integer
+    description: extra alignment score for alignments that cross database junctions
+    example: 2
+  - name: --sjdbInsertSave
+    type: string
+    description: |-
+      which files to save when sjdb junctions are inserted on the fly at the mapping step
+
+      - Basic ... only small junction / transcript files
+      - All   ... all files including big Genome, SA and SAindex - this will create a complete genome directory
+    example: Basic
+- name: Variation parameters
+  arguments:
+  - name: --varVCFfile
+    type: string
+    description: path to the VCF file that contains variation data. The 10th column
+      should contain the genotype information, e.g. 0/1
+- name: Read Parameters
+  arguments:
+  - name: --readFilesType
+    type: string
+    description: |-
+      format of input read files
+
+      - Fastx       ... FASTA or FASTQ
+      - SAM SE      ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view
+      - SAM PE      ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view
+    example: Fastx
+  - name: --readFilesSAMattrKeep
+    type: string
+    description: |-
+      for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL
+
+      - All     ... keep all tags
+      - None    ... do not keep any tags
+    example: All
+    multiple: yes
+    multiple_sep: ;
+  - name: --readFilesManifest
+    type: file
+    description: |-
+      path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns:
+
+      paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line.
+      single-end reads: read1_file_name $tab$ -               $tab$ read_group_line.
+      Spaces, but not tabs are allowed in file names.
+      If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it.
+      If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line.
+  - name: --readFilesPrefix
+    type: string
+    description: prefix for the read files names, i.e. it will be added in front of
+      the strings in --readFilesIn
+  - name: --readFilesCommand
+    type: string
+    description: |-
+      command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout
+
+      For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.
+    multiple: yes
+    multiple_sep: ;
+  - name: --readMapNumber
+    type: integer
+    description: |-
+      number of reads to map from the beginning of the file
+
+      -1: map all reads
+    example: -1
+  - name: --readMatesLengthsIn
+    type: string
+    description: Equal/NotEqual - lengths of names,sequences,qualities for both mates
+      are the same  / not the same. NotEqual is safe in all situations.
+    example: NotEqual
+  - name: --readNameSeparator
+    type: string
+    description: character(s) separating the part of the read names that will be trimmed
+      in output (read name after space is always trimmed)
+    example: /
+    multiple: yes
+    multiple_sep: ;
+  - name: --readQualityScoreBase
+    type: integer
+    description: number to be subtracted from the ASCII code to get Phred quality
+      score
+    example: 33
+- name: Read Clipping
+  arguments:
+  - name: --clipAdapterType
+    type: string
+    description: |-
+      adapter clipping type
+
+      - Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp
+      - CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Sosic: https://github.com/Martinsos/opal
+      - None ... no adapter clipping, all other clip* parameters are disregarded
+    example: Hamming
+  - name: --clip3pNbases
+    type: integer
+    description: number(s) of bases to clip from 3p of each mate. If one value is
+      given, it will be assumed the same for both mates.
+    example: 0
+    multiple: yes
+    multiple_sep: ;
+  - name: --clip3pAdapterSeq
+    type: string
+    description: |-
+      adapter sequences to clip from 3p of each mate.  If one value is given, it will be assumed the same for both mates.
+
+      - polyA ... polyA sequence with the length equal to read length
+    multiple: yes
+    multiple_sep: ;
+  - name: --clip3pAdapterMMp
+    type: double
+    description: max proportion of mismatches for 3p adapter clipping for each mate.  If
+      one value is given, it will be assumed the same for both mates.
+    example: 0.1
+    multiple: yes
+    multiple_sep: ;
+  - name: --clip3pAfterAdapterNbases
+    type: integer
+    description: number of bases to clip from 3p of each mate after the adapter clipping.
+      If one value is given, it will be assumed the same for both mates.
+    example: 0
+    multiple: yes
+    multiple_sep: ;
+  - name: --clip5pNbases
+    type: integer
+    description: number(s) of bases to clip from 5p of each mate. If one value is
+      given, it will be assumed the same for both mates.
+    example: 0
+    multiple: yes
+    multiple_sep: ;
+- name: Limits
+  arguments:
+  - name: --limitGenomeGenerateRAM
+    type: long
+    description: maximum available RAM (bytes) for genome generation
+    example: '31000000000'
+  - name: --limitIObufferSize
+    type: long
+    description: max available buffers size (bytes) for input/output, per thread
+    example:
+    - 30000000
+    - 50000000
+    multiple: yes
+    multiple_sep: ;
+  - name: --limitOutSAMoneReadBytes
+    type: long
+    description: 'max size of the SAM record (bytes) for one read. Recommended value:
+      >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax'
+    example: 100000
+  - name: --limitOutSJoneRead
+    type: integer
+    description: max number of junctions for one read (including all multi-mappers)
+    example: 1000
+  - name: --limitOutSJcollapsed
+    type: integer
+    description: max number of collapsed junctions
+    example: 1000000
+  - name: --limitBAMsortRAM
+    type: long
+    description: maximum available RAM (bytes) for sorting BAM. If =0, it will be
+      set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory
+      option.
+    example: 0
+  - name: --limitSjdbInsertNsj
+    type: integer
+    description: maximum number of junctions to be inserted to the genome on the fly
+      at the mapping stage, including those from annotations and those detected in
+      the 1st step of the 2-pass run
+    example: 1000000
+  - name: --limitNreadsSoft
+    type: integer
+    description: soft limit on the number of reads
+    example: -1
+- name: 'Output: general'
+  arguments:
+  - name: --outTmpKeep
+    type: string
+    description: |-
+      whether to keep the temporary files after STAR runs is finished
+
+      - None ... remove all temporary files
+      - All ... keep all files
+  - name: --outStd
+    type: string
+    description: |-
+      which output will be directed to stdout (standard out)
+
+      - Log                    ... log messages
+      - SAM                    ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out
+      - BAM_Unsorted           ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted
+      - BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate
+      - BAM_Quant              ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM
+    example: Log
+  - name: --outReadsUnmapped
+    type: string
+    description: |-
+      output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s).
+
+      - None    ... no output
+      - Fastx   ... output in separate fasta/fastq files, Unmapped.out.mate1/2
+  - name: --outQSconversionAdd
+    type: integer
+    description: add this number to the quality score (e.g. to convert from Illumina
+      to Sanger, use -31)
+    example: 0
+  - name: --outMultimapperOrder
+    type: string
+    description: |-
+      order of multimapping alignments in the output files
+
+      - Old_2.4             ... quasi-random order used before 2.5.0
+      - Random              ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases.
+    example: Old_2.4
+- name: 'Output: SAM and BAM'
+  arguments:
+  - name: --outSAMtype
+    type: string
+    description: |-
+      type of SAM/BAM output
+
+      1st word:
+      - BAM  ... output BAM without sorting
+      - SAM  ... output SAM without sorting
+      - None ... no SAM/BAM output
+      2nd, 3rd:
+      - Unsorted           ... standard unsorted
+      - SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM.
+    example: SAM
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMmode
+    type: string
+    description: |-
+      mode of SAM output
+
+      - None ... no SAM output
+      - Full ... full SAM output
+      - NoQS ... full SAM but without quality scores
+    example: Full
+  - name: --outSAMstrandField
+    type: string
+    description: |-
+      Cufflinks-like strand field flag
+
+      - None        ... not used
+      - intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out.
+  - name: --outSAMattributes
+    type: string
+    description: |-
+      a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order.
+
+      ***Presets:
+      - None        ... no attributes
+      - Standard    ... NH HI AS nM
+      - All         ... NH HI AS nM NM MD jM jI MC ch
+      ***Alignment:
+      - NH          ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag.
+      - HI          ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag.
+      - AS          ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag.
+      - nM          ... number of mismatches. For PE reads, sum over two mates.
+      - NM          ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag.
+      - MD          ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag.
+      - jM          ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value.
+      - jI          ... start and end of introns for all junctions (1-based).
+      - XS          ... alignment strand according to --outSAMstrandField.
+      - MC          ... mate's CIGAR string. Standard SAM tag.
+      - ch          ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output.
+      - cN          ... number of bases clipped from the read ends: 5' and 3'
+      ***Variation:
+      - vA          ... variant allele
+      - vG          ... genomic coordinate of the variant overlapped by the read.
+      - vW          ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag.
+      - ha          ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid .
+      ***STARsolo:
+      - CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.
+      - GX GN       ... gene ID and gene name for unique-gene reads.
+      - gx gn       ... gene IDs and gene names for unique- and multi-gene reads.
+      - CB UB       ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.
+      - sM          ... assessment of CB and UMI.
+      - sS          ... sequence of the entire barcode (CB,UMI,adapter).
+      - sQ          ... quality of the entire barcode.
+      - sF          ... type of feature overlap and number of features for each alignment
+      ***Unsupported/undocumented:
+      - rB          ... alignment block read/genomic coordinates.
+      - vR          ... read coordinate of the variant.
+    example: Standard
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMattrIHstart
+    type: integer
+    description: start value for the IH attribute. 0 may be required by some downstream
+      software, such as Cufflinks or StringTie.
+    example: 1
+  - name: --outSAMunmapped
+    type: string
+    description: |-
+      output of unmapped reads in the SAM format
+
+      1st word:
+      - None   ... no output
+      - Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam)
+      2nd word:
+      - KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads.
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMorder
+    type: string
+    description: |-
+      type of sorting for the SAM output
+
+      Paired: one mate after the other for all paired alignments
+      PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files
+    example: Paired
+  - name: --outSAMprimaryFlag
+    type: string
+    description: |-
+      which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG
+
+      - OneBestScore ... only one alignment with the best score is primary
+      - AllBestScore ... all alignments with the best score are primary
+    example: OneBestScore
+  - name: --outSAMreadID
+    type: string
+    description: |-
+      read ID record type
+
+      - Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end
+      - Number   ... read number (index) in the FASTx file
+    example: Standard
+  - name: --outSAMmapqUnique
+    type: integer
+    description: '0 to 255: the MAPQ value for unique mappers'
+    example: 255
+  - name: --outSAMflagOR
+    type: integer
+    description: '0 to 65535: sam FLAG will be bitwise OR''d with this value, i.e.
+      FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR,
+      and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.'
+    example: 0
+  - name: --outSAMflagAND
+    type: integer
+    description: '0 to 65535: sam FLAG will be bitwise AND''d with this value, i.e.
+      FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR,
+      but before outSAMflagOR. Can be used to unset specific bits that are not set
+      otherwise.'
+    example: 65535
+  - name: --outSAMattrRGline
+    type: string
+    description: |-
+      SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
+
+      xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.
+      Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g.
+      --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMheaderHD
+    type: string
+    description: '@HD (header) line of the SAM header'
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMheaderPG
+    type: string
+    description: extra @PG (software) line of the SAM header (in addition to STAR)
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMheaderCommentFile
+    type: string
+    description: path to the file with @CO (comment) lines of the SAM header
+  - name: --outSAMfilter
+    type: string
+    description: |-
+      filter the output into main SAM/BAM files
+
+      - KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage.
+      - KeepAllAddedReferences ...  keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage.
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSAMmultNmax
+    type: integer
+    description: |-
+      max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first
+
+      - -1 ... all alignments (up to --outFilterMultimapNmax) will be output
+    example: -1
+  - name: --outSAMtlen
+    type: integer
+    description: |-
+      calculation method for the TLEN field in the SAM/BAM files
+
+      - 1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate
+      - 2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends
+    example: 1
+  - name: --outBAMcompression
+    type: integer
+    description: -1 to 10  BAM compression level, -1=default compression (6?), 0=no
+      compression, 10=maximum compression
+    example: 1
+  - name: --outBAMsortingThreadN
+    type: integer
+    description: '>=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).'
+    example: 0
+  - name: --outBAMsortingBinsN
+    type: integer
+    description: '>0:  number of genome bins for coordinate-sorting'
+    example: 50
+- name: BAM processing
+  arguments:
+  - name: --bamRemoveDuplicatesType
+    type: string
+    description: |-
+      mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only
+
+      - -                       ... no duplicate removal/marking
+      - UniqueIdentical         ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical
+      - UniqueIdenticalNotMulti  ... mark duplicate unique mappers but not multimappers.
+  - name: --bamRemoveDuplicatesMate2basesN
+    type: integer
+    description: number of bases from the 5' of mate 2 to use in collapsing (e.g.
+      for RAMPAGE)
+    example: 0
+- name: Output Wiggle
+  arguments:
+  - name: --outWigType
+    type: string
+    description: |-
+      type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .
+
+      1st word:
+      - None       ... no signal output
+      - bedGraph   ... bedGraph format
+      - wiggle     ... wiggle format
+      2nd word:
+      - read1_5p   ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc
+      - read2      ... signal from only 2nd read
+    multiple: yes
+    multiple_sep: ;
+  - name: --outWigStrand
+    type: string
+    description: |-
+      strandedness of wiggle/bedGraph output
+
+      - Stranded   ...  separate strands, str1 and str2
+      - Unstranded ...  collapsed strands
+    example: Stranded
+  - name: --outWigReferencesPrefix
+    type: string
+    description: prefix matching reference names to include in the output wiggle file,
+      e.g. "chr", default "-" - include all references
+  - name: --outWigNorm
+    type: string
+    description: |-
+      type of normalization for the signal
+
+      - RPM    ... reads per million of mapped reads
+      - None   ... no normalization, "raw" counts
+    example: RPM
+- name: Output Filtering
+  arguments:
+  - name: --outFilterType
+    type: string
+    description: |-
+      type of filtering
+
+      - Normal  ... standard filtering using only current alignment
+      - BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab
+    example: Normal
+  - name: --outFilterMultimapScoreRange
+    type: integer
+    description: the score range below the maximum score for multimapping alignments
+    example: 1
+  - name: --outFilterMultimapNmax
+    type: integer
+    description: |-
+      maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value.
+
+      Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out .
+    example: 10
+  - name: --outFilterMismatchNmax
+    type: integer
+    description: alignment will be output only if it has no more mismatches than this
+      value.
+    example: 10
+  - name: --outFilterMismatchNoverLmax
+    type: double
+    description: alignment will be output only if its ratio of mismatches to *mapped*
+      length is less than or equal to this value.
+    example: 0.3
+  - name: --outFilterMismatchNoverReadLmax
+    type: double
+    description: alignment will be output only if its ratio of mismatches to *read*
+      length is less than or equal to this value.
+    example: 1.0
+  - name: --outFilterScoreMin
+    type: integer
+    description: alignment will be output only if its score is higher than or equal
+      to this value.
+    example: 0
+  - name: --outFilterScoreMinOverLread
+    type: double
+    description: same as outFilterScoreMin, but normalized to read length (sum of
+      mates' lengths for paired-end reads)
+    example: 0.66
+  - name: --outFilterMatchNmin
+    type: integer
+    description: alignment will be output only if the number of matched bases is higher
+      than or equal to this value.
+    example: 0
+  - name: --outFilterMatchNminOverLread
+    type: double
+    description: sam as outFilterMatchNmin, but normalized to the read length (sum
+      of mates' lengths for paired-end reads).
+    example: 0.66
+  - name: --outFilterIntronMotifs
+    type: string
+    description: |-
+      filter alignment using their motifs
+
+      - None                           ... no filtering
+      - RemoveNoncanonical             ... filter out alignments that contain non-canonical junctions
+      - RemoveNoncanonicalUnannotated  ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept.
+  - name: --outFilterIntronStrands
+    type: string
+    description: |-
+      filter alignments
+
+      - RemoveInconsistentStrands      ... remove alignments that have junctions with inconsistent strands
+      - None                           ... no filtering
+    example: RemoveInconsistentStrands
+- name: Output splice junctions (SJ.out.tab)
+  arguments:
+  - name: --outSJtype
+    type: string
+    description: |-
+      type of splice junction output
+
+      - Standard    ... standard SJ.out.tab output
+      - None        ... no splice junction output
+    example: Standard
+- name: 'Output Filtering: Splice Junctions'
+  arguments:
+  - name: --outSJfilterReads
+    type: string
+    description: |-
+      which reads to consider for collapsed splice junctions output
+
+      - All     ... all reads, unique- and multi-mappers
+      - Unique  ... uniquely mapping reads only
+    example: All
+  - name: --outSJfilterOverhangMin
+    type: integer
+    description: |-
+      minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
+
+      does not apply to annotated junctions
+    example:
+    - 30
+    - 12
+    - 12
+    - 12
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSJfilterCountUniqueMin
+    type: integer
+    description: |-
+      minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
+
+      Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied
+      does not apply to annotated junctions
+    example:
+    - 3
+    - 1
+    - 1
+    - 1
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSJfilterCountTotalMin
+    type: integer
+    description: |-
+      minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
+
+      Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied
+      does not apply to annotated junctions
+    example:
+    - 3
+    - 1
+    - 1
+    - 1
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSJfilterDistToOtherSJmin
+    type: integer
+    description: |-
+      minimum allowed distance to other junctions' donor/acceptor
+
+      does not apply to annotated junctions
+    example:
+    - 10
+    - 0
+    - 5
+    - 10
+    multiple: yes
+    multiple_sep: ;
+  - name: --outSJfilterIntronMaxVsReadN
+    type: integer
+    description: |-
+      maximum gap allowed for junctions supported by 1,2,3,,,N reads
+
+      i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax
+      does not apply to annotated junctions
+    example:
+    - 50000
+    - 100000
+    - 200000
+    multiple: yes
+    multiple_sep: ;
+- name: Scoring
+  arguments:
+  - name: --scoreGap
+    type: integer
+    description: splice junction penalty (independent on intron motif)
+    example: 0
+  - name: --scoreGapNoncan
+    type: integer
+    description: non-canonical junction penalty (in addition to scoreGap)
+    example: -8
+  - name: --scoreGapGCAG
+    type: integer
+    description: GC/AG and CT/GC junction penalty (in addition to scoreGap)
+    example: -4
+  - name: --scoreGapATAC
+    type: integer
+    description: AT/AC  and GT/AT junction penalty  (in addition to scoreGap)
+    example: -8
+  - name: --scoreGenomicLengthLog2scale
+    type: integer
+    description: 'extra score logarithmically scaled with genomic length of the alignment:
+      scoreGenomicLengthLog2scale*log2(genomicLength)'
+    example: 0
+  - name: --scoreDelOpen
+    type: integer
+    description: deletion open penalty
+    example: -2
+  - name: --scoreDelBase
+    type: integer
+    description: deletion extension penalty per base (in addition to scoreDelOpen)
+    example: -2
+  - name: --scoreInsOpen
+    type: integer
+    description: insertion open penalty
+    example: -2
+  - name: --scoreInsBase
+    type: integer
+    description: insertion extension penalty per base (in addition to scoreInsOpen)
+    example: -2
+  - name: --scoreStitchSJshift
+    type: integer
+    description: maximum score reduction while searching for SJ boundaries in the
+      stitching step
+    example: 1
+- name: Alignments and Seeding
+  arguments:
+  - name: --seedSearchStartLmax
+    type: integer
+    description: defines the search start point through the read - the read is split
+      into pieces no longer than this value
+    example: 50
+  - name: --seedSearchStartLmaxOverLread
+    type: double
+    description: seedSearchStartLmax normalized to read length (sum of mates' lengths
+      for paired-end reads)
+    example: 1.0
+  - name: --seedSearchLmax
+    type: integer
+    description: defines the maximum length of the seeds, if =0 seed length is not
+      limited
+    example: 0
+  - name: --seedMultimapNmax
+    type: integer
+    description: only pieces that map fewer than this value are utilized in the stitching
+      procedure
+    example: 10000
+  - name: --seedPerReadNmax
+    type: integer
+    description: max number of seeds per read
+    example: 1000
+  - name: --seedPerWindowNmax
+    type: integer
+    description: max number of seeds per window
+    example: 50
+  - name: --seedNoneLociPerWindow
+    type: integer
+    description: max number of one seed loci per window
+    example: 10
+  - name: --seedSplitMin
+    type: integer
+    description: min length of the seed sequences split by Ns or mate gap
+    example: 12
+  - name: --seedMapMin
+    type: integer
+    description: min length of seeds to be mapped
+    example: 5
+  - name: --alignIntronMin
+    type: integer
+    description: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin,
+      otherwise it is considered Deletion
+    example: 21
+  - name: --alignIntronMax
+    type: integer
+    description: maximum intron size, if 0, max intron size will be determined by
+      (2^winBinNbits)*winAnchorDistNbins
+    example: 0
+  - name: --alignMatesGapMax
+    type: integer
+    description: maximum gap between two mates, if 0, max intron gap will be determined
+      by (2^winBinNbits)*winAnchorDistNbins
+    example: 0
+  - name: --alignSJoverhangMin
+    type: integer
+    description: minimum overhang (i.e. block size) for spliced alignments
+    example: 5
+  - name: --alignSJstitchMismatchNmax
+    type: integer
+    description: |-
+      maximum number of mismatches for stitching of the splice junctions (-1: no limit).
+
+      (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.
+    example:
+    - 0
+    - -1
+    - 0
+    - 0
+    multiple: yes
+    multiple_sep: ;
+  - name: --alignSJDBoverhangMin
+    type: integer
+    description: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments
+    example: 3
+  - name: --alignSplicedMateMapLmin
+    type: integer
+    description: minimum mapped length for a read mate that is spliced
+    example: 0
+  - name: --alignSplicedMateMapLminOverLmate
+    type: double
+    description: alignSplicedMateMapLmin normalized to mate length
+    example: 0.66
+  - name: --alignWindowsPerReadNmax
+    type: integer
+    description: max number of windows per read
+    example: 10000
+  - name: --alignTranscriptsPerWindowNmax
+    type: integer
+    description: max number of transcripts per window
+    example: 100
+  - name: --alignTranscriptsPerReadNmax
+    type: integer
+    description: max number of different alignments per read to consider
+    example: 10000
+  - name: --alignEndsType
+    type: string
+    description: |-
+      type of read ends alignment
+
+      - Local             ... standard local alignment with soft-clipping allowed
+      - EndToEnd          ... force end-to-end read alignment, do not soft-clip
+      - Extend5pOfRead1   ... fully extend only the 5p of the read1, all other ends: local alignment
+      - Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment
+    example: Local
+  - name: --alignEndsProtrude
+    type: string
+    description: |-
+      allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate
+
+      1st word: int: maximum number of protrusion bases allowed
+      2nd word: string:
+      -                     ConcordantPair ... report alignments with non-zero protrusion as concordant pairs
+      -                     DiscordantPair ... report alignments with non-zero protrusion as discordant pairs
+    example: 0    ConcordantPair
+  - name: --alignSoftClipAtReferenceEnds
+    type: string
+    description: |-
+      allow the soft-clipping of the alignments past the end of the chromosomes
+
+      - Yes ... allow
+      - No  ... prohibit, useful for compatibility with Cufflinks
+    example: 'Yes'
+  - name: --alignInsertionFlush
+    type: string
+    description: |-
+      how to flush ambiguous insertion positions
+
+      - None    ... insertions are not flushed
+      - Right   ... insertions are flushed to the right
+- name: Paired-End reads
+  arguments:
+  - name: --peOverlapNbasesMin
+    type: integer
+    description: minimum number of overlapping bases to trigger mates merging and
+      realignment. Specify >0 value to switch on the "merginf of overlapping mates"
+      algorithm.
+    example: 0
+  - name: --peOverlapMMp
+    type: double
+    description: maximum proportion of mismatched bases in the overlap area
+    example: 0.01
+- name: Windows, Anchors, Binning
+  arguments:
+  - name: --winAnchorMultimapNmax
+    type: integer
+    description: max number of loci anchors are allowed to map to
+    example: 50
+  - name: --winBinNbits
+    type: integer
+    description: =log2(winBin), where winBin is the size of the bin for the windows/clustering,
+      each window will occupy an integer number of bins.
+    example: 16
+  - name: --winAnchorDistNbins
+    type: integer
+    description: max number of bins between two anchors that allows aggregation of
+      anchors into one window
+    example: 9
+  - name: --winFlankNbins
+    type: integer
+    description: log2(winFlank), where win Flank is the size of the left and right
+      flanking regions for each window
+    example: 4
+  - name: --winReadCoverageRelativeMin
+    type: double
+    description: minimum relative coverage of the read sequence by the seeds in a
+      window, for STARlong algorithm only.
+    example: 0.5
+  - name: --winReadCoverageBasesMin
+    type: integer
+    description: minimum number of bases covered by the seeds in a window , for STARlong
+      algorithm only.
+    example: 0
+- name: Chimeric Alignments
+  arguments:
+  - name: --chimOutType
+    type: string
+    description: |-
+      type of chimeric output
+
+      - Junctions       ... Chimeric.out.junction
+      - SeparateSAMold  ... output old SAM into separate Chimeric.out.sam file
+      - WithinBAM       ... output into main aligned BAM files (Aligned.*.bam)
+      - WithinBAM HardClip  ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present)
+      - WithinBAM SoftClip  ... soft-clipping in the CIGAR for supplemental chimeric alignments
+    example: Junctions
+    multiple: yes
+    multiple_sep: ;
+  - name: --chimSegmentMin
+    type: integer
+    description: minimum length of chimeric segment length, if ==0, no chimeric output
+    example: 0
+  - name: --chimScoreMin
+    type: integer
+    description: minimum total (summed) score of the chimeric segments
+    example: 0
+  - name: --chimScoreDropMax
+    type: integer
+    description: max drop (difference) of chimeric score (the sum of scores of all
+      chimeric segments) from the read length
+    example: 20
+  - name: --chimScoreSeparation
+    type: integer
+    description: minimum difference (separation) between the best chimeric score and
+      the next one
+    example: 10
+  - name: --chimScoreJunctionNonGTAG
+    type: integer
+    description: penalty for a non-GT/AG chimeric junction
+    example: -1
+  - name: --chimJunctionOverhangMin
+    type: integer
+    description: minimum overhang for a chimeric junction
+    example: 20
+  - name: --chimSegmentReadGapMax
+    type: integer
+    description: maximum gap in the read sequence between chimeric segments
+    example: 0
+  - name: --chimFilter
+    type: string
+    description: |-
+      different filters for chimeric alignments
+
+      - None ... no filtering
+      - banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction
+    example: banGenomicN
+    multiple: yes
+    multiple_sep: ;
+  - name: --chimMainSegmentMultNmax
+    type: integer
+    description: maximum number of multi-alignments for the main chimeric segment.
+      =1 will prohibit multimapping main segments.
+    example: 10
+  - name: --chimMultimapNmax
+    type: integer
+    description: |-
+      maximum number of chimeric multi-alignments
+
+      - 0 ... use the old scheme for chimeric detection which only considered unique alignments
+    example: 0
+  - name: --chimMultimapScoreRange
+    type: integer
+    description: the score range for multi-mapping chimeras below the best chimeric
+      score. Only works with --chimMultimapNmax > 1
+    example: 1
+  - name: --chimNonchimScoreDropMin
+    type: integer
+    description: to trigger chimeric detection, the drop in the best non-chimeric
+      alignment score with respect to the read length has to be greater than this
+      value
+    example: 20
+  - name: --chimOutJunctionFormat
+    type: integer
+    description: |-
+      formatting type for the Chimeric.out.junction file
+
+      - 0 ... no comment lines/headers
+      - 1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping
+    example: 0
+- name: Quantification of Annotations
+  arguments:
+  - name: --quantMode
+    type: string
+    description: |-
+      types of quantification requested
+
+      - -                ... none
+      - TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file
+      - GeneCounts       ... count reads per gene
+    multiple: yes
+    multiple_sep: ;
+  - name: --quantTranscriptomeBAMcompression
+    type: integer
+    description: |-
+      -2 to 10  transcriptome BAM compression level
+
+      - -2  ... no BAM output
+      - -1  ... default compression (6?)
+      -  0  ... no compression
+      -  10 ... maximum compression
+    example: 1
+  - name: --quantTranscriptomeSAMoutput
+    type: string
+    description: |-
+      alignment filtering for TranscriptomeSAM output
+
+      - BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM
+      - BanSingleEnd               ... prohibit single-end alignments, allow indels and softclips
+      - BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels
+    example: BanSingleEnd_BanIndels_ExtendSoftclip
+- name: 2-pass Mapping
+  arguments:
+  - name: --twopassMode
+    type: string
+    description: |-
+      2-pass mapping mode.
+
+      - None        ... 1-pass mapping
+      - Basic       ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly
+  - name: --twopass1readsN
+    type: integer
+    description: number of reads to process for the 1st step. Use very large number
+      (or default -1) to map all reads in the first step.
+    example: -1
+- name: WASP parameters
+  arguments:
+  - name: --waspOutputMode
+    type: string
+    description: |-
+      WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061-1063 (2015), https://www.nature.com/articles/nmeth.3582 .
+
+      - SAMtag      ... add WASP tags to the alignments that pass WASP filtering
diff --git a/src/star/star_align_reads/config.vsh.yaml b/src/star/star_align_reads/config.vsh.yaml
new file mode 100644
index 00000000..8fdd5256
--- /dev/null
+++ b/src/star/star_align_reads/config.vsh.yaml
@@ -0,0 +1,115 @@
+name: star_align_reads
+namespace: star
+description: |
+  Aligns reads to a reference genome using STAR.
+keywords: [align, fasta, genome]
+links:
+  repository: https://github.com/alexdobin/STAR
+  documentation: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf
+references:
+  doi: 10.1093/bioinformatics/bts635
+license: MIT
+requirements:
+  commands: [ STAR, python, ps, zcat, bzcat ]
+# manually taking care of the main input and output arguments
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: file
+        name: --input
+        alternatives: --readFilesIn
+        required: true
+        description: The single-end or paired-end R1 FastQ files to be processed.
+        example: [ mysample_S1_L001_R1_001.fastq.gz ]
+        multiple: true
+      - type: file
+        name: --input_r2
+        required: false
+        description: The paired-end R2 FastQ files to be processed. Only required if --input is a paired-end R1 file.
+        example: [ mysample_S1_L001_R2_001.fastq.gz ]
+        multiple: true
+  - name: Outputs
+    arguments:
+      - type: file
+        name: --aligned_reads
+        required: true
+        description: The output file containing the aligned reads.
+        direction: output
+        example: aligned_reads.bam
+      - type: file
+        name: --reads_per_gene
+        required: false
+        description: The output file containing the number of reads per gene.
+        direction: output
+        example: reads_per_gene.tsv
+      - type: file
+        name: --unmapped
+        required: false
+        description: The output file containing the unmapped reads.
+        direction: output
+        example: unmapped.fastq
+      - type: file
+        name: --unmapped_r2
+        required: false
+        description: The output file containing the unmapped R2 reads.
+        direction: output
+        example: unmapped_r2.fastq
+      - type: file
+        name: --chimeric_junctions
+        required: false
+        description: The output file containing the chimeric junctions.
+        direction: output
+        example: chimeric_junctions.tsv
+      - type: file
+        name: --log
+        required: false
+        description: The output file containing the log of the alignment process.
+        direction: output
+        example: log.txt
+      - type: file
+        name: --splice_junctions
+        required: false
+        description: The output file containing the splice junctions.
+        direction: output
+        example: splice_junctions.tsv
+# other arguments are defined in a separate file
+__merge__: argument_groups.yaml
+resources:
+  - type: python_script
+    path: script.py
+test_resources:
+  - type: bash_script
+    path: test.sh
+engines:
+  - type: docker
+    image: python:3.12-slim
+    setup:
+      - type: apt
+        packages:
+          - procps
+          - gzip
+          - bzip2
+      # setup derived from https://github.com/alexdobin/STAR/blob/master/extras/docker/Dockerfile
+      - type: docker
+        env: 
+          - STAR_VERSION 2.7.11b
+          - PACKAGES gcc g++ make wget zlib1g-dev unzip xxd
+        run: |
+          apt-get update && \
+            apt-get install -y --no-install-recommends ${PACKAGES} && \
+            cd /tmp && \
+            wget --no-check-certificate https://github.com/alexdobin/STAR/archive/refs/tags/${STAR_VERSION}.zip && \
+            unzip ${STAR_VERSION}.zip && \
+            cd STAR-${STAR_VERSION}/source && \
+            make STARstatic CXXFLAGS_SIMD=-std=c++11 && \
+            cp STAR /usr/local/bin && \
+            cd / && \
+            rm -rf /tmp/STAR-${STAR_VERSION} /tmp/${STAR_VERSION}.zip && \
+            apt-get --purge autoremove -y ${PACKAGES} && \
+            apt-get clean
+      - type: docker
+        run: |
+          STAR --version | sed 's#\(.*\)#star: "\1"#' > /var/software_versions.txt
+runners:
+  - type: executable
+  - type: nextflow
diff --git a/src/star/star_align_reads/help.txt b/src/star/star_align_reads/help.txt
new file mode 100644
index 00000000..940f639d
--- /dev/null
+++ b/src/star/star_align_reads/help.txt
@@ -0,0 +1,927 @@
+Usage: STAR  [options]... --genomeDir /path/to/genome/index/   --readFilesIn R1.fq R2.fq
+Spliced Transcripts Alignment to a Reference (c) Alexander Dobin, 2009-2022
+
+STAR version=2.7.11b
+STAR compilation time,server,dir=2024-02-11T19:36:26+00:00 :/tmp/STAR-2.7.11b/source
+For more details see:
+<https://github.com/alexdobin/STAR>
+<https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf>
+### versions
+versionGenome           2.7.4a
+    string: earliest genome index version compatible with this STAR release. Please do not change this value!
+
+### Parameter Files
+parametersFiles          -
+    string: name of a user-defined parameters file, "-": none. Can only be defined on the command line.
+
+### System
+sysShell            -
+    string: path to the shell binary, preferably bash, e.g. /bin/bash.
+                    - ... the default shell is executed, typically /bin/sh. This was reported to fail on some Ubuntu systems - then you need to specify path to bash.
+
+### Run Parameters
+runMode                         alignReads
+    string: type of the run.
+                                alignReads             ... map reads
+                                genomeGenerate         ... generate genome files
+                                inputAlignmentsFromBAM ... input alignments from BAM. Presently only works with --outWigType and --bamRemoveDuplicates options.
+                                liftOver               ... lift-over of GTF files (--sjdbGTFfile) between genome assemblies using chain file(s) from --genomeChainFiles.
+                                soloCellFiltering  </path/to/raw/count/dir/>   </path/to/output/prefix>    ... STARsolo cell filtering ("calling") without remapping, followed by the path to raw count directory and output (filtered) prefix
+
+runThreadN                      1
+    int: number of threads to run STAR
+
+runDirPerm                      User_RWX
+    string: permissions for the directories created at the run-time.
+                                User_RWX ... user-read/write/execute
+                                All_RWX  ... all-read/write/execute (same as chmod 777)
+
+runRNGseed                      777
+    int: random number generator seed.
+
+
+### Genome Parameters
+genomeDir                   ./GenomeDir/
+    string: path to the directory where genome files are stored (for --runMode alignReads) or will be generated (for --runMode generateGenome)
+
+genomeLoad                NoSharedMemory
+    string: mode of shared memory usage for the genome files. Only used with --runMode alignReads.
+                          LoadAndKeep     ... load genome into shared and keep it in memory after run
+                          LoadAndRemove   ... load genome into shared but remove it after run
+                          LoadAndExit     ... load genome into shared memory and exit, keeping the genome in memory for future runs
+                          Remove          ... do not map anything, just remove loaded genome from memory
+                          NoSharedMemory  ... do not use shared memory, each job will have its own private copy of the genome
+
+genomeFastaFiles            -
+    string(s): path(s) to the fasta files with the genome sequences, separated by spaces. These files should be plain text FASTA files, they *cannot* be zipped.
+                            Required for the genome generation (--runMode genomeGenerate). Can also be used in the mapping (--runMode alignReads) to add extra (new) sequences to the genome (e.g. spike-ins).
+
+genomeChainFiles            -
+    string: chain files for genomic liftover. Only used with --runMode liftOver .
+
+genomeFileSizes             0
+    uint(s)>0: genome files exact sizes in bytes. Typically, this should not be defined by the user.
+    
+genomeTransformOutput       None
+    string(s):              which output to transform back to original genome
+                            SAM     ... SAM/BAM alignments
+                            SJ      ... splice junctions (SJ.out.tab)
+                            Quant   ... quantifications (from --quantMode option)
+                            None    ... no transformation of the output        
+
+genomeChrSetMitochondrial   chrM M MT
+    string(s):              names of the mitochondrial chromosomes. Presently only used for STARsolo statistics output/
+
+### Genome Indexing Parameters - only used with --runMode genomeGenerate
+genomeChrBinNbits           18
+    int: =log2(chrBin), where chrBin is the size of the bins for genome storage: each chromosome will occupy an integer number of bins. For a genome with large number of contigs, it is recommended to scale this parameter as min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)]).
+
+genomeSAindexNbases         14
+    int: length (bases) of the SA pre-indexing string. Typically between 10 and 15. Longer strings will use much more memory, but allow faster searches. For small genomes, the parameter --genomeSAindexNbases must be scaled down to min(14, log2(GenomeLength)/2 - 1).
+
+genomeSAsparseD             1
+    int>0: suffux array sparsity, i.e. distance between indices: use bigger numbers to decrease needed RAM at the cost of mapping speed reduction
+
+genomeSuffixLengthMax       -1
+    int: maximum length of the suffixes, has to be longer than read length. -1 = infinite.
+    
+genomeTransformType         None
+    string: type of genome transformation
+                            None       ... no transformation
+                            Haploid    ... replace reference alleles with alternative alleles from VCF file (e.g. consensus allele)
+                            Diploid    ... create two haplotypes for each chromosome listed in VCF file, for genotypes 1|2, assumes perfect phasing (e.g. personal genome)
+
+genomeTransformVCF          -
+    string: path to VCF file for genome transformation
+
+
+    
+#####UnderDevelopment_begin : not supported - do not use
+genomeType                  Full
+    string: type of genome to generate
+                            Full                ... full (normal) genome
+                            Transcriptome       ... genome consists of transcript sequences
+                            SuperTransriptome   ... genome consists of superTranscript sequences
+#####UnderDevelopment_end
+
+# DEPRECATED: please use --genomeTransformVCF and --genomeTransformType options instead.
+#genomeConsensusFile         -
+#    string: VCF file with consensus SNPs (i.e. alternative allele is the major (AF>0.5) allele)
+# DEPRECATED 
+
+
+
+### Splice Junctions Database
+sjdbFileChrStartEnd                     -
+    string(s): path to the files with genomic coordinates (chr <tab> start <tab> end <tab> strand) for the splice junction introns. Multiple files can be supplied and will be concatenated.
+
+sjdbGTFfile                             -
+    string: path to the GTF file with annotations
+
+sjdbGTFchrPrefix                        -
+    string: prefix for chromosome names in a GTF file (e.g. 'chr' for using ENSMEBL annotations with UCSC genomes)
+
+sjdbGTFfeatureExon                      exon
+    string: feature type in GTF file to be used as exons for building transcripts
+
+sjdbGTFtagExonParentTranscript          transcript_id
+    string: GTF attribute name for parent transcript ID (default "transcript_id" works for GTF files)
+
+sjdbGTFtagExonParentGene                gene_id
+    string: GTF attribute name for parent gene ID (default "gene_id" works for GTF files)
+
+sjdbGTFtagExonParentGeneName            gene_name
+    string(s): GTF attribute name for parent gene name
+
+sjdbGTFtagExonParentGeneType            gene_type gene_biotype
+    string(s): GTF attribute name for parent gene type
+
+sjdbOverhang                            100
+    int>0: length of the donor/acceptor sequence on each side of the junctions, ideally = (mate_length - 1)
+
+sjdbScore                               2
+    int: extra alignment score for alignments that cross database junctions
+
+sjdbInsertSave                          Basic
+    string: which files to save when sjdb junctions are inserted on the fly at the mapping step
+                    Basic ... only small junction / transcript files
+                    All   ... all files including big Genome, SA and SAindex - this will create a complete genome directory
+
+### Variation parameters
+varVCFfile                              -
+    string: path to the VCF file that contains variation data. The 10th column should contain the genotype information, e.g. 0/1
+
+### Input Files
+inputBAMfile                -
+    string: path to BAM input file, to be used with --runMode inputAlignmentsFromBAM
+
+### Read Parameters
+readFilesType               Fastx
+    string: format of input read files
+                            Fastx       ... FASTA or FASTQ
+                            SAM SE      ... SAM or BAM single-end reads; for BAM use --readFilesCommand samtools view
+                            SAM PE      ... SAM or BAM paired-end reads; for BAM use --readFilesCommand samtools view
+                            
+readFilesSAMattrKeep        All
+    string(s): for --readFilesType SAM SE/PE, which SAM tags to keep in the output BAM, e.g.: --readFilesSAMtagsKeep RG PL
+                            All     ... keep all tags
+                            None    ... do not keep any tags
+
+readFilesIn                 Read1 Read2
+    string(s): paths to files that contain input read1 (and, if needed,  read2)
+
+readFilesManifest           -
+    string: path to the "manifest" file with the names of read files. The manifest file should contain 3 tab-separated columns:
+            paired-end reads: read1_file_name $tab$ read2_file_name $tab$ read_group_line.
+            single-end reads: read1_file_name $tab$ -               $tab$ read_group_line.
+            Spaces, but not tabs are allowed in file names.
+            If read_group_line does not start with ID:, it can only contain one ID field, and ID: will be added to it.
+            If read_group_line starts with ID:, it can contain several fields separated by $tab$, and all fields will be be copied verbatim into SAM @RG header line.
+
+readFilesPrefix             -
+    string: prefix for the read files names, i.e. it will be added in front of the strings in --readFilesIn
+
+readFilesCommand             -
+    string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout
+               For example: zcat - to uncompress .gz files, bzcat - to uncompress .bz2 files, etc.
+
+readMapNumber               -1
+    int: number of reads to map from the beginning of the file
+                            -1: map all reads
+
+readMatesLengthsIn          NotEqual
+    string: Equal/NotEqual - lengths of names,sequences,qualities for both mates are the same  / not the same. NotEqual is safe in all situations.
+
+readNameSeparator           /
+    string(s): character(s) separating the part of the read names that will be trimmed in output (read name after space is always trimmed)
+
+readQualityScoreBase        33
+    int>=0: number to be subtracted from the ASCII code to get Phred quality score
+
+### Read Clipping
+
+clipAdapterType             Hamming
+    string:                 adapter clipping type
+                            Hamming ... adapter clipping based on Hamming distance, with the number of mismatches controlled by --clip5pAdapterMMp
+                            CellRanger4 ... 5p and 3p adapter clipping similar to CellRanger4. Utilizes Opal package by Martin Šošić: https://github.com/Martinsos/opal
+                            None ... no adapter clipping, all other clip* parameters are disregarded
+                            
+clip3pNbases                 0
+    int(s): number(s) of bases to clip from 3p of each mate. If one value is given, it will be assumed the same for both mates.
+
+clip3pAdapterSeq            -
+    string(s): adapter sequences to clip from 3p of each mate.  If one value is given, it will be assumed the same for both mates.
+                            polyA ... polyA sequence with the length equal to read length
+
+clip3pAdapterMMp            0.1
+    double(s): max proportion of mismatches for 3p adapter clipping for each mate.  If one value is given, it will be assumed the same for both mates.
+
+clip3pAfterAdapterNbases    0
+    int(s): number of bases to clip from 3p of each mate after the adapter clipping. If one value is given, it will be assumed the same for both mates.
+
+clip5pNbases                 0
+    int(s): number(s) of bases to clip from 5p of each mate. If one value is given, it will be assumed the same for both mates.
+
+#####UnderDevelopment_begin : not supported - do not use   
+clip5pAdapterSeq            -
+    string(s): adapter sequences to clip from 5p of each mate, separated by space.
+
+clip5pAdapterMMp            0.1
+    double(s): max proportion of mismatches for 5p adapter clipping for each mate, separated by space
+
+clip5pAfterAdapterNbases    0
+    int(s): number of bases to clip from 5p of each mate after the adapter clipping, separated by space.
+#####UnderDevelopment_end
+
+### Limits
+limitGenomeGenerateRAM               31000000000
+    int>0: maximum available RAM (bytes) for genome generation
+
+limitIObufferSize                    30000000 50000000
+    int(s)>0: max available buffers size (bytes) for input/output, per thread
+
+limitOutSAMoneReadBytes              100000
+    int>0: max size of the SAM record (bytes) for one read. Recommended value: >(2*(LengthMate1+LengthMate2+100)*outFilterMultimapNmax
+
+limitOutSJoneRead                    1000
+    int>0: max number of junctions for one read (including all multi-mappers)
+
+limitOutSJcollapsed                  1000000
+    int>0: max number of collapsed junctions
+
+limitBAMsortRAM                         0
+    int>=0: maximum available RAM (bytes) for sorting BAM. If =0, it will be set to the genome index size. 0 value can only be used with --genomeLoad NoSharedMemory option.
+
+limitSjdbInsertNsj                     1000000
+    int>=0: maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run
+
+limitNreadsSoft                        -1
+    int: soft limit on the number of reads
+
+### Output: general
+outFileNamePrefix               ./
+    string: output files name prefix (including full or relative path). Can only be defined on the command line.
+
+outTmpDir                       -
+    string: path to a directory that will be used as temporary by STAR. All contents of this directory will be removed!
+                                - ... the temp directory will default to outFileNamePrefix_STARtmp
+
+outTmpKeep                      None
+    string: whether to keep the temporary files after STAR runs is finished
+                                None ... remove all temporary files
+                                All ... keep all files
+
+outStd                          Log
+    string: which output will be directed to stdout (standard out)
+                                Log                    ... log messages
+                                SAM                    ... alignments in SAM format (which normally are output to Aligned.out.sam file), normal standard output will go into Log.std.out
+                                BAM_Unsorted           ... alignments in BAM format, unsorted. Requires --outSAMtype BAM Unsorted
+                                BAM_SortedByCoordinate ... alignments in BAM format, sorted by coordinate. Requires --outSAMtype BAM SortedByCoordinate
+                                BAM_Quant              ... alignments to transcriptome in BAM format, unsorted. Requires --quantMode TranscriptomeSAM
+
+outReadsUnmapped                None
+   string: output of unmapped and partially mapped (i.e. mapped only one mate of a paired end read) reads in separate file(s).
+                                None    ... no output
+                                Fastx   ... output in separate fasta/fastq files, Unmapped.out.mate1/2
+
+outQSconversionAdd              0
+   int: add this number to the quality score (e.g. to convert from Illumina to Sanger, use -31)
+
+outMultimapperOrder             Old_2.4
+    string: order of multimapping alignments in the output files
+                                Old_2.4             ... quasi-random order used before 2.5.0
+                                Random              ... random order of alignments for each multi-mapper. Read mates (pairs) are always adjacent, all alignment for each read stay together. This option will become default in the future releases.
+
+### Output: SAM and BAM
+outSAMtype                      SAM
+    strings: type of SAM/BAM output
+                                1st word:
+                                BAM  ... output BAM without sorting
+                                SAM  ... output SAM without sorting
+                                None ... no SAM/BAM output
+                                2nd, 3rd:
+                                Unsorted           ... standard unsorted
+                                SortedByCoordinate ... sorted by coordinate. This option will allocate extra memory for sorting which can be specified by --limitBAMsortRAM.
+
+outSAMmode                      Full
+    string: mode of SAM output
+                                None ... no SAM output
+                                Full ... full SAM output
+                                NoQS ... full SAM but without quality scores
+
+outSAMstrandField               None
+    string: Cufflinks-like strand field flag
+                                None        ... not used
+                                intronMotif ... strand derived from the intron motif. This option changes the output alignments: reads with inconsistent and/or non-canonical introns are filtered out.
+
+outSAMattributes                Standard
+    string(s): a string of desired SAM attributes, in the order desired for the output SAM. Tags can be listed in any combination/order.
+                                ***Presets:
+                                None        ... no attributes
+                                Standard    ... NH HI AS nM
+                                All         ... NH HI AS nM NM MD jM jI MC ch                                                                    
+                                ***Alignment:
+                                NH          ... number of loci the reads maps to: =1 for unique mappers, >1 for multimappers. Standard SAM tag.
+                                HI          ... multiple alignment index, starts with --outSAMattrIHstart (=1 by default). Standard SAM tag.
+                                AS          ... local alignment score, +1/-1 for matches/mismateches, score* penalties for indels and gaps. For PE reads, total score for two mates. Stadnard SAM tag.
+                                nM          ... number of mismatches. For PE reads, sum over two mates.
+                                NM          ... edit distance to the reference (number of mismatched + inserted + deleted bases) for each mate. Standard SAM tag.
+                                MD          ... string encoding mismatched and deleted reference bases (see standard SAM specifications). Standard SAM tag.
+                                jM          ... intron motifs for all junctions (i.e. N in CIGAR): 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT. If splice junctions database is used, and a junction is annotated, 20 is added to its motif value.
+                                jI          ... start and end of introns for all junctions (1-based).
+                                XS          ... alignment strand according to --outSAMstrandField.
+                                MC          ... mate's CIGAR string. Standard SAM tag.
+                                ch          ... marks all segment of all chimeric alingments for --chimOutType WithinBAM output.
+                                cN          ... number of bases clipped from the read ends: 5' and 3'
+                                ***Variation:
+                                vA          ... variant allele
+                                vG          ... genomic coordinate of the variant overlapped by the read.
+                                vW          ... 1 - alignment passes WASP filtering; 2,3,4,5,6,7 - alignment does not pass WASP filtering. Requires --waspOutputMode SAMtag.
+                                ha          ... haplotype (1/2) when mapping to the diploid genome. Requires genome generated with --genomeTransformType Diploid .                               
+                                ***STARsolo:
+                                CR CY UR UY ... sequences and quality scores of cell barcodes and UMIs for the solo* demultiplexing.
+                                GX GN       ... gene ID and gene name for unique-gene reads.
+                                gx gn       ... gene IDs and gene names for unique- and multi-gene reads.
+                                CB UB       ... error-corrected cell barcodes and UMIs for solo* demultiplexing. Requires --outSAMtype BAM SortedByCoordinate.
+                                sM          ... assessment of CB and UMI.
+                                sS          ... sequence of the entire barcode (CB,UMI,adapter).
+                                sQ          ... quality of the entire barcode.
+                                sF          ... type of feature overlap and number of features for each alignment
+                                ***Unsupported/undocumented:
+                                rB          ... alignment block read/genomic coordinates.
+                                vR          ... read coordinate of the variant.
+
+outSAMattrIHstart               1
+    int>=0:                     start value for the IH attribute. 0 may be required by some downstream software, such as Cufflinks or StringTie.
+
+outSAMunmapped                  None
+    string(s): output of unmapped reads in the SAM format
+                                1st word:
+                                None   ... no output
+                                Within ... output unmapped reads within the main SAM file (i.e. Aligned.out.sam)
+                                2nd word:
+                                KeepPairs ... record unmapped mate for each alignment, and, in case of unsorted output, keep it adjacent to its mapped mate. Only affects multi-mapping reads.
+
+outSAMorder                     Paired
+    string: type of sorting for the SAM output
+                                Paired: one mate after the other for all paired alignments
+                                PairedKeepInputOrder: one mate after the other for all paired alignments, the order is kept the same as in the input FASTQ files
+
+outSAMprimaryFlag        OneBestScore
+    string: which alignments are considered primary - all others will be marked with 0x100 bit in the FLAG
+                                OneBestScore ... only one alignment with the best score is primary
+                                AllBestScore ... all alignments with the best score are primary
+
+outSAMreadID            Standard
+    string: read ID record type
+                                Standard ... first word (until space) from the FASTx read ID line, removing /1,/2 from the end
+                                Number   ... read number (index) in the FASTx file
+
+outSAMmapqUnique        255
+    int: 0 to 255: the MAPQ value for unique mappers
+
+outSAMflagOR           0
+    int: 0 to 65535: sam FLAG will be bitwise OR'd with this value, i.e. FLAG=FLAG | outSAMflagOR. This is applied after all flags have been set by STAR, and after outSAMflagAND. Can be used to set specific bits that are not set otherwise.
+
+outSAMflagAND           65535
+    int: 0 to 65535: sam FLAG will be bitwise AND'd with this value, i.e. FLAG=FLAG & outSAMflagOR. This is applied after all flags have been set by STAR, but before outSAMflagOR. Can be used to unset specific bits that are not set otherwise.
+
+outSAMattrRGline        -
+    string(s): SAM/BAM read group line. The first word contains the read group identifier and must start with "ID:", e.g. --outSAMattrRGline ID:xxx CN:yy "DS:z z z".
+            xxx will be added as RG tag to each output alignment. Any spaces in the tag values have to be double quoted.
+            Comma separated RG lines correspons to different (comma separated) input files in --readFilesIn. Commas have to be surrounded by spaces, e.g.
+            --outSAMattrRGline ID:xxx , ID:zzz "DS:z z" , ID:yyy DS:yyyy
+
+outSAMheaderHD          -
+    strings: @HD (header) line of the SAM header
+
+outSAMheaderPG          -
+    strings: extra @PG (software) line of the SAM header (in addition to STAR)
+
+outSAMheaderCommentFile -
+    string: path to the file with @CO (comment) lines of the SAM header
+
+outSAMfilter            None
+    string(s): filter the output into main SAM/BAM files
+                        KeepOnlyAddedReferences ... only keep the reads for which all alignments are to the extra reference sequences added with --genomeFastaFiles at the mapping stage.
+                        KeepAllAddedReferences ...  keep all alignments to the extra reference sequences added with --genomeFastaFiles at the mapping stage.
+
+
+outSAMmultNmax          -1
+    int: max number of multiple alignments for a read that will be output to the SAM/BAM files. Note that if this value is not equal to -1, the top scoring alignment will be output first
+                        -1 ... all alignments (up to --outFilterMultimapNmax) will be output
+
+outSAMtlen              1
+    int: calculation method for the TLEN field in the SAM/BAM files
+                        1 ... leftmost base of the (+)strand mate to rightmost base of the (-)mate. (+)sign for the (+)strand mate
+                        2 ... leftmost base of any mate to rightmost base of any mate. (+)sign for the mate with the leftmost base. This is different from 1 for overlapping mates with protruding ends
+
+outBAMcompression       1
+    int: -1 to 10  BAM compression level, -1=default compression (6?), 0=no compression, 10=maximum compression
+
+outBAMsortingThreadN    0
+    int: >=0: number of threads for BAM sorting. 0 will default to min(6,--runThreadN).
+
+outBAMsortingBinsN      50
+    int: >0:  number of genome bins for coordinate-sorting
+
+### BAM processing
+bamRemoveDuplicatesType  -
+    string: mark duplicates in the BAM file, for now only works with (i) sorted BAM fed with inputBAMfile, and (ii) for paired-end alignments only
+                        -                       ... no duplicate removal/marking
+                        UniqueIdentical         ... mark all multimappers, and duplicate unique mappers. The coordinates, FLAG, CIGAR must be identical
+                        UniqueIdenticalNotMulti  ... mark duplicate unique mappers but not multimappers.
+
+bamRemoveDuplicatesMate2basesN   0
+    int>0: number of bases from the 5' of mate 2 to use in collapsing (e.g. for RAMPAGE)
+
+### Output Wiggle
+outWigType          None
+    string(s): type of signal output, e.g. "bedGraph" OR "bedGraph read1_5p". Requires sorted BAM: --outSAMtype BAM SortedByCoordinate .
+                    1st word:
+                    None       ... no signal output
+                    bedGraph   ... bedGraph format
+                    wiggle     ... wiggle format
+                    2nd word:
+                    read1_5p   ... signal from only 5' of the 1st read, useful for CAGE/RAMPAGE etc
+                    read2      ... signal from only 2nd read
+
+outWigStrand        Stranded
+    string: strandedness of wiggle/bedGraph output
+                    Stranded   ...  separate strands, str1 and str2
+                    Unstranded ...  collapsed strands
+
+outWigReferencesPrefix    -
+    string: prefix matching reference names to include in the output wiggle file, e.g. "chr", default "-" - include all references
+
+outWigNorm              RPM
+    string: type of normalization for the signal
+                        RPM    ... reads per million of mapped reads
+                        None   ... no normalization, "raw" counts
+
+### Output Filtering
+outFilterType                   Normal
+    string: type of filtering
+                                Normal  ... standard filtering using only current alignment
+                                BySJout ... keep only those reads that contain junctions that passed filtering into SJ.out.tab
+
+outFilterMultimapScoreRange     1
+    int: the score range below the maximum score for multimapping alignments
+
+outFilterMultimapNmax           10
+    int: maximum number of loci the read is allowed to map to. Alignments (all of them) will be output only if the read maps to no more loci than this value.
+         Otherwise no alignments will be output, and the read will be counted as "mapped to too many loci" in the Log.final.out .
+
+outFilterMismatchNmax           10
+    int: alignment will be output only if it has no more mismatches than this value.
+
+outFilterMismatchNoverLmax      0.3
+    real: alignment will be output only if its ratio of mismatches to *mapped* length is less than or equal to this value.
+
+outFilterMismatchNoverReadLmax  1.0
+    real: alignment will be output only if its ratio of mismatches to *read* length is less than or equal to this value.
+
+
+outFilterScoreMin               0
+    int: alignment will be output only if its score is higher than or equal to this value.
+
+outFilterScoreMinOverLread      0.66
+    real: same as outFilterScoreMin, but normalized to read length (sum of mates' lengths for paired-end reads)
+
+outFilterMatchNmin              0
+    int: alignment will be output only if the number of matched bases is higher than or equal to this value.
+
+outFilterMatchNminOverLread     0.66
+    real: sam as outFilterMatchNmin, but normalized to the read length (sum of mates' lengths for paired-end reads).
+
+outFilterIntronMotifs           None
+    string: filter alignment using their motifs
+                None                           ... no filtering
+                RemoveNoncanonical             ... filter out alignments that contain non-canonical junctions
+                RemoveNoncanonicalUnannotated  ... filter out alignments that contain non-canonical unannotated junctions when using annotated splice junctions database. The annotated non-canonical junctions will be kept.
+
+outFilterIntronStrands          RemoveInconsistentStrands
+    string: filter alignments
+                RemoveInconsistentStrands      ... remove alignments that have junctions with inconsistent strands
+                None                           ... no filtering
+
+### Output splice junctions (SJ.out.tab)
+outSJtype                       Standard
+    string: type of splice junction output
+                                Standard    ... standard SJ.out.tab output
+                                None        ... no splice junction output
+
+### Output Filtering: Splice Junctions
+outSJfilterReads                All
+    string: which reads to consider for collapsed splice junctions output
+                                All     ... all reads, unique- and multi-mappers
+                                Unique  ... uniquely mapping reads only
+
+outSJfilterOverhangMin          30  12  12  12
+    4 integers:    minimum overhang length for splice junctions on both sides for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
+                                does not apply to annotated junctions
+
+outSJfilterCountUniqueMin       3   1   1   1
+    4 integers: minimum uniquely mapping read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
+                                Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied
+                                does not apply to annotated junctions
+
+outSJfilterCountTotalMin     3   1   1   1
+    4 integers: minimum total (multi-mapping+unique) read count per junction for: (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif. -1 means no output for that motif
+                                Junctions are output if one of outSJfilterCountUniqueMin OR outSJfilterCountTotalMin conditions are satisfied
+                                does not apply to annotated junctions
+
+outSJfilterDistToOtherSJmin     10  0   5   10
+    4 integers>=0: minimum allowed distance to other junctions' donor/acceptor
+                                does not apply to annotated junctions
+
+outSJfilterIntronMaxVsReadN        50000 100000 200000
+    N integers>=0: maximum gap allowed for junctions supported by 1,2,3,,,N reads
+                                i.e. by default junctions supported by 1 read can have gaps <=50000b, by 2 reads: <=100000b, by 3 reads: <=200000. by >=4 reads any gap <=alignIntronMax
+                                does not apply to annotated junctions
+
+### Scoring
+scoreGap                     0
+    int: splice junction penalty (independent on intron motif)
+
+scoreGapNoncan               -8
+    int: non-canonical junction penalty (in addition to scoreGap)
+
+scoreGapGCAG                 -4
+    int: GC/AG and CT/GC junction penalty (in addition to scoreGap)
+
+scoreGapATAC                 -8
+    int: AT/AC  and GT/AT junction penalty  (in addition to scoreGap)
+
+scoreGenomicLengthLog2scale   -0.25
+    int: extra score logarithmically scaled with genomic length of the alignment: scoreGenomicLengthLog2scale*log2(genomicLength)
+
+scoreDelOpen                 -2
+    int: deletion open penalty
+
+scoreDelBase                 -2
+    int: deletion extension penalty per base (in addition to scoreDelOpen)
+
+scoreInsOpen                 -2
+    int: insertion open penalty
+
+scoreInsBase                 -2
+    int: insertion extension penalty per base (in addition to scoreInsOpen)
+
+scoreStitchSJshift           1
+    int: maximum score reduction while searching for SJ boundaries in the stitching step
+
+
+### Alignments and Seeding
+
+seedSearchStartLmax             50
+    int>0: defines the search start point through the read - the read is split into pieces no longer than this value
+
+seedSearchStartLmaxOverLread    1.0
+    real: seedSearchStartLmax normalized to read length (sum of mates' lengths for paired-end reads)
+
+seedSearchLmax       0
+    int>=0: defines the maximum length of the seeds, if =0 seed length is not limited
+
+seedMultimapNmax      10000
+    int>0: only pieces that map fewer than this value are utilized in the stitching procedure
+
+seedPerReadNmax       1000
+    int>0: max number of seeds per read
+
+seedPerWindowNmax     50
+    int>0: max number of seeds per window
+
+seedNoneLociPerWindow    10
+    int>0: max number of one seed loci per window
+
+seedSplitMin                12
+    int>0: min length of the seed sequences split by Ns or mate gap
+
+seedMapMin              5
+    int>0: min length of seeds to be mapped
+
+alignIntronMin              21
+    int: minimum intron size, genomic gap is considered intron if its length>=alignIntronMin, otherwise it is considered Deletion
+
+alignIntronMax              0
+    int: maximum intron size, if 0, max intron size will be determined by (2^winBinNbits)*winAnchorDistNbins
+
+alignMatesGapMax            0
+    int: maximum gap between two mates, if 0, max intron gap will be determined by (2^winBinNbits)*winAnchorDistNbins
+
+alignSJoverhangMin          5
+    int>0: minimum overhang (i.e. block size) for spliced alignments
+
+alignSJstitchMismatchNmax   0 -1 0 0
+    4*int>=0: maximum number of mismatches for stitching of the splice junctions (-1: no limit).
+                            (1) non-canonical motifs, (2) GT/AG and CT/AC motif, (3) GC/AG and CT/GC motif, (4) AT/AC and GT/AT motif.
+
+alignSJDBoverhangMin        3
+    int>0: minimum overhang (i.e. block size) for annotated (sjdb) spliced alignments
+
+alignSplicedMateMapLmin     0
+    int>0: minimum mapped length for a read mate that is spliced
+
+alignSplicedMateMapLminOverLmate 0.66
+    real>0: alignSplicedMateMapLmin normalized to mate length
+
+alignWindowsPerReadNmax     10000
+    int>0: max number of windows per read
+
+alignTranscriptsPerWindowNmax       100
+    int>0: max number of transcripts per window
+
+alignTranscriptsPerReadNmax               10000
+    int>0: max number of different alignments per read to consider
+
+alignEndsType           Local
+    string: type of read ends alignment
+                        Local             ... standard local alignment with soft-clipping allowed
+                        EndToEnd          ... force end-to-end read alignment, do not soft-clip
+                        Extend5pOfRead1   ... fully extend only the 5p of the read1, all other ends: local alignment
+                        Extend5pOfReads12 ... fully extend only the 5p of the both read1 and read2, all other ends: local alignment
+
+alignEndsProtrude       0    ConcordantPair
+    int, string:        allow protrusion of alignment ends, i.e. start (end) of the +strand mate downstream of the start (end) of the -strand mate
+                        1st word: int: maximum number of protrusion bases allowed
+                        2nd word: string:
+                                            ConcordantPair ... report alignments with non-zero protrusion as concordant pairs
+                                            DiscordantPair ... report alignments with non-zero protrusion as discordant pairs
+
+alignSoftClipAtReferenceEnds    Yes
+    string: allow the soft-clipping of the alignments past the end of the chromosomes
+                                Yes ... allow
+                                No  ... prohibit, useful for compatibility with Cufflinks
+
+alignInsertionFlush     None
+    string: how to flush ambiguous insertion positions
+                        None    ... insertions are not flushed
+                        Right   ... insertions are flushed to the right
+
+### Paired-End reads
+peOverlapNbasesMin          0
+    int>=0:             minimum number of overlapping bases to trigger mates merging and realignment. Specify >0 value to switch on the "merginf of overlapping mates" algorithm.
+
+peOverlapMMp                0.01
+    real, >=0 & <1:     maximum proportion of mismatched bases in the overlap area
+
+### Windows, Anchors, Binning
+
+winAnchorMultimapNmax           50
+    int>0: max number of loci anchors are allowed to map to
+
+winBinNbits                     16
+    int>0: =log2(winBin), where winBin is the size of the bin for the windows/clustering, each window will occupy an integer number of bins.
+
+winAnchorDistNbins              9
+    int>0: max number of bins between two anchors that allows aggregation of anchors into one window
+
+winFlankNbins                   4
+    int>0: log2(winFlank), where win Flank is the size of the left and right flanking regions for each window
+
+winReadCoverageRelativeMin      0.5
+    real>=0: minimum relative coverage of the read sequence by the seeds in a window, for STARlong algorithm only.
+
+winReadCoverageBasesMin      0
+    int>0: minimum number of bases covered by the seeds in a window , for STARlong algorithm only.
+
+### Chimeric Alignments
+chimOutType                 Junctions
+    string(s): type of chimeric output
+                            Junctions       ... Chimeric.out.junction
+                            SeparateSAMold  ... output old SAM into separate Chimeric.out.sam file
+                            WithinBAM       ... output into main aligned BAM files (Aligned.*.bam)
+                            WithinBAM HardClip  ... (default) hard-clipping in the CIGAR for supplemental chimeric alignments (default if no 2nd word is present)
+                            WithinBAM SoftClip  ... soft-clipping in the CIGAR for supplemental chimeric alignments
+
+chimSegmentMin              0
+    int>=0: minimum length of chimeric segment length, if ==0, no chimeric output
+
+chimScoreMin                0
+    int>=0: minimum total (summed) score of the chimeric segments
+
+chimScoreDropMax            20
+    int>=0: max drop (difference) of chimeric score (the sum of scores of all chimeric segments) from the read length
+
+chimScoreSeparation         10
+    int>=0: minimum difference (separation) between the best chimeric score and the next one
+
+chimScoreJunctionNonGTAG    -1
+    int: penalty for a non-GT/AG chimeric junction
+
+chimJunctionOverhangMin     20
+    int>=0: minimum overhang for a chimeric junction
+
+chimSegmentReadGapMax       0
+    int>=0: maximum gap in the read sequence between chimeric segments
+
+chimFilter                  banGenomicN
+    string(s): different filters for chimeric alignments
+                            None ... no filtering
+                            banGenomicN ... Ns are not allowed in the genome sequence around the chimeric junction
+
+chimMainSegmentMultNmax        10
+    int>=1: maximum number of multi-alignments for the main chimeric segment. =1 will prohibit multimapping main segments.
+
+chimMultimapNmax                    0
+    int>=0: maximum number of chimeric multi-alignments
+                                0 ... use the old scheme for chimeric detection which only considered unique alignments
+
+chimMultimapScoreRange          1
+    int>=0: the score range for multi-mapping chimeras below the best chimeric score. Only works with --chimMultimapNmax > 1
+
+chimNonchimScoreDropMin         20
+    int>=0: to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be greater than this value
+
+chimOutJunctionFormat           0
+    int: formatting type for the Chimeric.out.junction file
+                                0 ... no comment lines/headers
+                                1 ... comment lines at the end of the file: command line and Nreads: total, unique/multi-mapping
+
+### Quantification of Annotations
+quantMode                   -
+    string(s): types of quantification requested
+                            -                ... none
+                            TranscriptomeSAM ... output SAM/BAM alignments to transcriptome into a separate file
+                            GeneCounts       ... count reads per gene
+
+quantTranscriptomeBAMcompression    1
+    int: -2 to 10  transcriptome BAM compression level
+                            -2  ... no BAM output
+                            -1  ... default compression (6?)
+                             0  ... no compression
+                             10 ... maximum compression
+
+quantTranscriptomeSAMoutput BanSingleEnd_BanIndels_ExtendSoftclip
+    string: alignment filtering for TranscriptomeSAM output
+                            BanSingleEnd_BanIndels_ExtendSoftclip ... prohibit indels and single-end alignments, extend softclips - compatible with RSEM
+                            BanSingleEnd               ... prohibit single-end alignments, allow indels and softclips
+                            BanSingleEnd_ExtendSoftclip ... prohibit single-end alignments, extend softclips, allow indels
+
+
+### 2-pass Mapping
+twopassMode                 None
+    string: 2-pass mapping mode.
+                            None        ... 1-pass mapping
+                            Basic       ... basic 2-pass mapping, with all 1st pass junctions inserted into the genome indices on the fly
+
+twopass1readsN              -1
+    int: number of reads to process for the 1st step. Use very large number (or default -1) to map all reads in the first step.
+
+
+### WASP parameters
+waspOutputMode              None
+    string: WASP allele-specific output type. This is re-implementation of the original WASP mappability filtering by Bryce van de Geijn, Graham McVicker, Yoav Gilad & Jonathan K Pritchard. Please cite the original WASP paper: Nature Methods 12, 1061–1063 (2015), https://www.nature.com/articles/nmeth.3582 .
+                            SAMtag      ... add WASP tags to the alignments that pass WASP filtering
+
+### STARsolo (single cell RNA-seq) parameters
+soloType                    None
+    string(s): type of single-cell RNA-seq
+                            CB_UMI_Simple   ... (a.k.a. Droplet) one UMI and one Cell Barcode of fixed length in read2, e.g. Drop-seq and 10X Chromium.
+                            CB_UMI_Complex  ... multiple Cell Barcodes of varying length, one UMI of fixed length and one adapter sequence of fixed length are allowed in read2 only (e.g. inDrop, ddSeq).
+                            CB_samTagOut    ... output Cell Barcode as CR and/or CB SAm tag. No UMI counting. --readFilesIn cDNA_read1 [cDNA_read2 if paired-end] CellBarcode_read . Requires --outSAMtype BAM Unsorted [and/or SortedByCoordinate]
+                            SmartSeq        ... Smart-seq: each cell in a separate FASTQ (paired- or single-end), barcodes are corresponding read-groups, no UMI sequences, alignments deduplicated according to alignment start and end (after extending soft-clipped bases)
+
+soloCBtype                  Sequence
+    string: cell barcode type
+                            Sequence: cell barcode is a sequence (standard option)
+                            String: cell barcode is an arbitrary string
+
+soloCBwhitelist             -
+    string(s): file(s) with whitelist(s) of cell barcodes. Only --soloType CB_UMI_Complex allows more than one whitelist file.
+                            None            ... no whitelist: all cell barcodes are allowed
+
+soloCBstart                 1
+    int>0: cell barcode start base
+
+soloCBlen                   16
+    int>0: cell barcode length
+
+soloUMIstart                17
+    int>0: UMI start base
+
+soloUMIlen                  10
+    int>0: UMI length
+
+soloBarcodeReadLength       1
+    int: length of the barcode read
+                            1   ... equal to sum of soloCBlen+soloUMIlen
+                            0   ... not defined, do not check
+
+soloBarcodeMate             0
+    int: identifies which read mate contains the barcode (CB+UMI) sequence
+                            0   ... barcode sequence is on separate read, which should always be the last file in the --readFilesIn listed
+                            1   ... barcode sequence is a part of mate 1
+                            2   ... barcode sequence is a part of mate 2
+
+soloCBposition              -
+    strings(s):             position of Cell Barcode(s) on the barcode read.
+                            Presently only works with --soloType CB_UMI_Complex, and barcodes are assumed to be on Read2.
+                            Format for each barcode: startAnchor_startPosition_endAnchor_endPosition
+                            start(end)Anchor defines the Anchor Base for the CB: 0: read start; 1: read end; 2: adapter start; 3: adapter end
+                            start(end)Position is the 0-based position with of the CB start(end) with respect to the Anchor Base
+                            String for different barcodes are separated by space.
+                            Example: inDrop (Zilionis et al, Nat. Protocols, 2017):
+                            --soloCBposition  0_0_2_-1  3_1_3_8
+
+soloUMIposition             -
+    string:                  position of the UMI on the barcode read, same as soloCBposition
+                            Example: inDrop (Zilionis et al, Nat. Protocols, 2017):
+                            --soloCBposition  3_9_3_14
+
+soloAdapterSequence         -
+    string:                 adapter sequence to anchor barcodes. Only one adapter sequence is allowed.
+
+soloAdapterMismatchesNmax   1
+    int>0:                  maximum number of mismatches allowed in adapter sequence.
+
+soloCBmatchWLtype           1MM_multi
+    string:                 matching the Cell Barcodes to the WhiteList
+                            Exact                           ... only exact matches allowed
+                            1MM                             ... only one match in whitelist with 1 mismatched base allowed. Allowed CBs have to have at least one read with exact match.
+                            1MM_multi                       ... multiple matches in whitelist with 1 mismatched base allowed, posterior probability calculation is used choose one of the matches.
+                                                                Allowed CBs have to have at least one read with exact match. This option matches best with CellRanger 2.2.0
+                            1MM_multi_pseudocounts          ... same as 1MM_Multi, but pseudocounts of 1 are added to all whitelist barcodes.
+                            1MM_multi_Nbase_pseudocounts    ... same as 1MM_multi_pseudocounts, multimatching to WL is allowed for CBs with N-bases. This option matches best with CellRanger >= 3.0.0
+                            EditDist_2                    ... allow up to edit distance of 3 fpr each of the barcodes. May include one deletion + one insertion. Only works with --soloType CB_UMI_Complex. Matches to multiple passlist barcdoes are not allowed. Similar to ParseBio Split-seq pipeline. 
+
+soloInputSAMattrBarcodeSeq  -
+    string(s):              when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode sequence (in proper order).
+                            For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeSeq CR UR .
+                            This parameter is required when running STARsolo with input from SAM. 
+    
+soloInputSAMattrBarcodeQual  -
+    string(s):              when inputting reads from a SAM file (--readsFileType SAM SE/PE), these SAM attributes mark the barcode qualities (in proper order).
+                            For instance, for 10X CellRanger or STARsolo BAMs, use --soloInputSAMattrBarcodeQual CY UY .
+                            If this parameter is '-' (default), the quality 'H' will be assigned to all bases.
+
+soloStrand                  Forward
+    string: strandedness of the solo libraries:
+                            Unstranded  ... no strand information
+                            Forward     ... read strand same as the original RNA molecule
+                            Reverse     ... read strand opposite to the original RNA molecule
+
+soloFeatures                Gene
+    string(s): genomic features for which the UMI counts per Cell Barcode are collected
+                            Gene            ... genes: reads match the gene transcript
+                            SJ              ... splice junctions: reported in SJ.out.tab
+                            GeneFull        ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns
+                            GeneFull_ExonOverIntron ... full gene (pre-mRNA): count all reads overlapping genes' exons and introns: prioritize 100% overlap with exons
+                            GeneFull_Ex50pAS        ... full gene (pre-RNA): count all reads overlapping genes' exons and introns: prioritize >50% overlap with exons. Do not count reads with 100% exonic overlap in the antisense direction.
+
+#####UnderDevelopment_begin : not supported - do not use
+                            Transcript3p    ... quantification of transcript for 3' protocols
+#####UnderDevelopment_end
+
+soloMultiMappers            Unique
+    string(s): counting method for reads mapping to multiple genes           
+                            Unique     ... count only reads that map to unique genes
+                            Uniform    ... uniformly distribute multi-genic UMIs to all genes
+                            Rescue     ... distribute UMIs proportionally to unique+uniform counts (~ first iteration of EM)
+                            PropUnique ... distribute UMIs proportionally to unique mappers, if present, and uniformly if not.
+                            EM         ... multi-gene UMIs are distributed using Expectation Maximization algorithm
+
+soloUMIdedup                1MM_All
+    string(s):              type of UMI deduplication (collapsing) algorithm
+                            1MM_All                     ... all UMIs with 1 mismatch distance to each other are collapsed (i.e. counted once).
+                            1MM_Directional_UMItools    ... follows the "directional" method from the UMI-tools by Smith, Heger and Sudbery (Genome Research 2017).
+                            1MM_Directional             ... same as 1MM_Directional_UMItools, but with more stringent criteria for duplicate UMIs
+                            Exact                       ... only exactly matching UMIs are collapsed.
+                            NoDedup                     ... no deduplication of UMIs, count all reads.
+                            1MM_CR                      ... CellRanger2-4 algorithm for 1MM UMI collapsing.
+
+soloUMIfiltering            -
+    string(s):              type of UMI filtering (for reads uniquely mapping to genes)
+                            -                  ... basic filtering: remove UMIs with N and homopolymers (similar to CellRanger 2.2.0).
+                            MultiGeneUMI       ... basic + remove lower-count UMIs that map to more than one gene.
+                            MultiGeneUMI_All   ... basic + remove all UMIs that map to more than one gene.
+                            MultiGeneUMI_CR    ... basic + remove lower-count UMIs that map to more than one gene, matching CellRanger > 3.0.0 .
+                                                   Only works with --soloUMIdedup 1MM_CR
+                                                
+soloOutFileNames            Solo.out/          features.tsv barcodes.tsv        matrix.mtx
+    string(s):              file names for STARsolo output:
+                            file_name_prefix   gene_names   barcode_sequences   cell_feature_count_matrix
+
+soloCellFilter              CellRanger2.2 3000 0.99 10
+    string(s):              cell filtering type and parameters
+                            None            ... do not output filtered cells
+                            TopCells        ... only report top cells by UMI count, followed by the exact number of cells
+                            CellRanger2.2   ... simple filtering of CellRanger 2.2. 
+                                                Can be followed by numbers: number of expected cells, robust maximum percentile for UMI count, maximum to minimum ratio for UMI count
+                                                The harcoded values are from CellRanger: nExpectedCells=3000;  maxPercentile=0.99;  maxMinRatio=10
+                            EmptyDrops_CR   ... EmptyDrops filtering in CellRanger flavor. Please cite the original EmptyDrops paper: A.T.L Lun et al, Genome Biology, 20, 63 (2019): https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1662-y
+                                                Can be followed by 10 numeric parameters:  nExpectedCells   maxPercentile   maxMinRatio   indMin   indMax   umiMin   umiMinFracMedian   candMaxN   FDR   simN 
+                                                The harcoded values are from CellRanger:             3000            0.99            10    45000    90000      500               0.01      20000  0.01  10000
+
+soloOutFormatFeaturesGeneField3    "Gene Expression"
+    string(s):                field 3 in the Gene features.tsv file. If "-", then no 3rd field is output.
+
+soloCellReadStats           None
+    string:                 Output reads statistics for each CB
+                            Standard    ... standard output
+
+#####UnderDevelopment_begin : not supported - do not use
+soloClusterCBfile           -
+    string:                 file containing the cluster information for cell barcodes, two columns: CB cluster_index. Only used with --soloFeatures Transcript3p
+#####UnderDevelopment_end
diff --git a/src/star/star_align_reads/script.py b/src/star/star_align_reads/script.py
new file mode 100644
index 00000000..2bde8798
--- /dev/null
+++ b/src/star/star_align_reads/script.py
@@ -0,0 +1,109 @@
+import tempfile
+import subprocess
+import shutil
+from pathlib import Path
+
+## VIASH START
+par = {
+    "input": [
+        "src/star/star_align_reads/test_data/a_R1.1.fastq",
+        "src/star/star_align_reads/test_data/a_R1.2.fastq",
+    ],
+    "input_r2": [
+        "src/star/star_align_reads/test_data/a_R2.1.fastq",
+        "src/star/star_align_reads/test_data/a_R2.2.fastq",
+    ],
+    "genomeDir": "src/star/star_align_reads/test_data/genome.fasta",
+    "aligned_reads": "aligned_reads.sam"
+}
+meta = {
+    "cpus": 8,
+    "temp_dir": "/tmp"
+}
+## VIASH END
+
+##################################################
+# check and process SE / PE R1 input files
+input_r1 = par["input"]
+readFilesIn = ",".join(par["input"])
+par["input"] = None
+
+# check and process PE R2 input files
+input_r2 = par["input_r2"]
+if input_r2 is not None:
+    if len(input_r1) != len(input_r2):
+        raise ValueError("The number of R1 and R2 files do not match.")
+    readFilesIn = [readFilesIn, ",".join(par["input_r2"])]
+    par["input_r2"] = None
+
+# store readFilesIn
+par["readFilesIn"] = readFilesIn
+
+##################################################
+
+# determine readFilesCommand
+if input_r1[0].endswith(".gz"):
+    print(">> Input files are gzipped, setting readFilesCommand to zcat", flush=True)
+    par["readFilesCommand"] = "zcat"
+elif input_r1[0].endswith(".bz2"):
+    print(">> Input files are bzipped, setting readFilesCommand to bzcat", flush=True)
+    par["readFilesCommand"] = "bzcat"
+
+##################################################
+# store output paths
+expected_outputs = {
+    "aligned_reads": ["Aligned.out.sam", "Aligned.out.bam"],
+    "reads_per_gene": "ReadsPerGene.out.tab",
+    "chimeric_junctions": "Chimeric.out.junction",
+    "log": "Log.final.out",
+    "splice_junctions": "SJ.out.tab",
+    "unmapped": "Unmapped.out.mate1",
+    "unmapped_r2": "Unmapped.out.mate2"
+}
+output_paths = {name: par[name] for name in expected_outputs.keys()}
+for name in expected_outputs.keys():
+    par[name] = None
+
+##################################################
+# process other args
+par["runMode"] = "alignReads"
+
+if "cpus" in meta and meta["cpus"]:
+    par["runThreadN"] = meta["cpus"]
+
+##################################################
+# run STAR and move output to final destination
+with tempfile.TemporaryDirectory(prefix="star-", dir=meta["temp_dir"], ignore_cleanup_errors=True) as temp_dir:
+    print(">> Constructing command", flush=True)
+
+    # set output paths
+    temp_dir = Path(temp_dir)
+    par["outTmpDir"] = temp_dir / "tempdir"
+    out_dir = temp_dir / "out"
+    par["outFileNamePrefix"] = f"{out_dir}/" # star needs this slash
+
+    # construct command
+    cmd_args = [ "STAR" ]
+    for name, value in par.items():
+        if value is not None:
+            val_to_add = value if isinstance(value, list) else [value]
+            cmd_args.extend([f"--{name}"] + [str(x) for x in val_to_add])
+    print("", flush=True)
+
+    # run command
+    print(">> Running STAR with command:", flush=True)
+    print(f"+ {' '.join(cmd_args)}", end="\n\n", flush=True)
+    subprocess.run(
+        cmd_args,
+        check=True
+    )
+    print(">> STAR finished successfully", end="\n\n", flush=True)
+
+    # move output to final destination
+    print(">> Moving output to final destination", flush=True)
+    for name, paths in expected_outputs.items():
+        for expected_path in [paths] if isinstance(paths, str) else paths:
+            expected_full_path = out_dir / expected_path
+            if output_paths[name] and expected_full_path.is_file():
+                print(f">> Moving {expected_path} to {output_paths[name]}", flush=True)
+                shutil.move(expected_full_path, output_paths[name])
diff --git a/src/star/star_align_reads/test.sh b/src/star/star_align_reads/test.sh
new file mode 100644
index 00000000..374b9014
--- /dev/null
+++ b/src/star/star_align_reads/test.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+
+set -e
+
+## VIASH START
+meta_executable="target/docker/star/star_align_reads/star_align_reads"
+meta_resources_dir="src/star/star_align_reads"
+## VIASH END
+
+#########################################################################################
+
+# helper functions
+assert_file_exists() {
+  [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1)
+}
+assert_file_doesnt_exist() {
+  [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1)
+}
+assert_file_empty() {
+  [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1)
+}
+assert_file_not_empty() {
+  [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1)
+}
+assert_file_contains() {
+  grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1)
+}
+assert_file_not_contains() {
+  grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1)
+}
+assert_file_contains_regex() {
+  grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1)
+}
+assert_file_not_contains_regex() {
+  grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1)
+}
+
+#########################################################################################
+echo "> Prepare test data"
+
+cat > reads_R1.fastq <<'EOF'
+@SEQ_ID1
+ACGCTGCCTCATAAGCCTCACACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIII
+@SEQ_ID2
+ACCCGCAAGATTAGGCTCCGTACAC
++
+!!!!!!!!!!!!!!!!!!!!!!!!!
+EOF
+
+cat > reads_R2.fastq <<'EOF'
+@SEQ_ID1
+ATGTGTGAGGCTTATGAGGCAGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIII
+@SEQ_ID2
+GTGTACGGAGCCTAATCTTGCAGGG
++
+!!!!!!!!!!!!!!!!!!!!!!!!!
+EOF
+
+cat > genome.fasta <<'EOF'
+>chr1
+TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG
+GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA
+TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA
+EOF
+
+cat > genes.gtf <<'EOF'
+chr1    example_source  gene    0    50   .   +   .   gene_id "gene1"; transcript_id "transcript1";
+chr1    example_source  exon    20   40   .   +   .   gene_id "gene1"; transcript_id "transcript1"; 
+EOF
+
+echo "> Generate index"
+STAR \
+  ${meta_cpus:+--runThreadN $meta_cpus} \
+  --runMode genomeGenerate \
+  --genomeDir "index/" \
+  --genomeFastaFiles "genome.fasta" \
+  --sjdbGTFfile "genes.gtf" \
+  --genomeSAindexNbases 2
+  
+#########################################################################################
+
+mkdir star_align_reads_se
+cd star_align_reads_se
+
+echo "> Run star_align_reads on SE"
+"$meta_executable" \
+  --input "../reads_R1.fastq" \
+  --genomeDir "../index/" \
+  --aligned_reads "output.sam" \
+  --log "log.txt" \
+  --outReadsUnmapped "Fastx" \
+  --unmapped "unmapped.sam" \
+  --quantMode "TranscriptomeSAM;GeneCounts" \
+  --reads_per_gene "reads_per_gene.tsv" \
+  --outSJtype Standard \
+  --splice_junctions "splice_junctions.tsv" \
+  ${meta_cpus:+---cpus $meta_cpus}
+
+# TODO: Test data doesn't contain any chimeric reads yet
+# --chimOutType "Junctions" \
+# --chimeric_junctions "chimeric_junctions.tsv" \
+
+echo ">> Check if output exists"
+assert_file_exists "output.sam"
+assert_file_exists "log.txt"
+assert_file_exists "reads_per_gene.tsv"
+# assert_file_exists "chimeric_junctions.tsv"
+assert_file_exists "splice_junctions.tsv"
+assert_file_exists "unmapped.sam"
+
+echo ">> Check if output contents are not empty"
+assert_file_not_empty "output.sam"
+assert_file_not_empty "log.txt"
+assert_file_not_empty "reads_per_gene.tsv"
+# assert_file_not_empty "chimeric_junctions.tsv"
+# assert_file_not_empty "splice_junctions.tsv" # TODO: test data doesn't contain any splice junctions yet
+assert_file_not_empty "unmapped.sam"
+
+echo ">> Check if output contents are correct"
+assert_file_contains "log.txt" "Number of input reads \\|	2"
+assert_file_contains "log.txt" "Number of reads unmapped: too short \\|	1"
+assert_file_contains "log.txt" "Uniquely mapped reads number \\|	1"
+assert_file_contains "reads_per_gene.tsv" "gene1	1	1	0"
+assert_file_contains "reads_per_gene.tsv" "N_unmapped	1	1	1"
+assert_file_contains "output.sam" "SEQ_ID1	0	chr1	17	255	25M	\\*	0	0	ACGCTGCCTCATAAGCCTCACACAT	IIIIIIIIIIIIIIIIIIIIIIIII	NH:i:1	HI:i:1	AS:i:24	nM:i:0"
+assert_file_contains "unmapped.sam" "@SEQ_ID2 0:N:"
+assert_file_contains "unmapped.sam" "ACCCGCAAGATTAGGCTCCGTACAC"
+
+cd ..
+
+#########################################################################################
+
+mkdir star_align_reads_pe_minimal
+cd star_align_reads_pe_minimal
+
+echo ">> Run star_align_reads on PE"
+"$meta_executable" \
+  --input ../reads_R1.fastq \
+  --input_r2 ../reads_R2.fastq \
+  --genomeDir ../index/ \
+  --aligned_reads output.bam \
+  --log log.txt \
+  --outReadsUnmapped Fastx \
+  --unmapped unmapped_r1.bam \
+  --unmapped_r2 unmapped_r2.bam \
+  ${meta_cpus:+---cpus $meta_cpus}
+
+echo ">> Check if output exists"
+assert_file_exists "output.bam"
+assert_file_exists "log.txt"
+assert_file_exists "unmapped_r1.bam"
+assert_file_exists "unmapped_r2.bam"
+
+echo ">> Check if output contents are not empty"
+assert_file_not_empty "output.bam"
+assert_file_not_empty "log.txt"
+assert_file_not_empty "unmapped_r1.bam"
+assert_file_not_empty "unmapped_r2.bam"
+
+echo ">> Check if output contents are correct"
+assert_file_contains "log.txt" "Number of input reads \\|	2"
+assert_file_contains "log.txt" "Number of reads unmapped: too short \\|	1"
+assert_file_contains "log.txt" "Uniquely mapped reads number \\|	1"
+
+cd ..
+
+#########################################################################################
+
+echo "> Test successful"
diff --git a/src/star/star_align_reads/utils/process_params.R b/src/star/star_align_reads/utils/process_params.R
new file mode 100644
index 00000000..ccdc50b3
--- /dev/null
+++ b/src/star/star_align_reads/utils/process_params.R
@@ -0,0 +1,189 @@
+library(tidyverse)
+
+# This script processes the STAR aligner's help file
+# to create a viash argument_groups.yaml file.
+
+local_file <- "src/star/star_align_reads/help.txt"
+yaml_file <- "src/star/star_align_reads/argument_groups.yaml"
+
+param_txt <- readr::read_lines(local_file)
+
+# replace non-ascii characters with their ascii approximations
+param_txt <- iconv(param_txt, "UTF-8", "ASCII//TRANSLIT")
+
+dev_begin <- grep("#####UnderDevelopment_begin", param_txt)
+dev_end <- grep("#####UnderDevelopment_end", param_txt)
+
+# strip development sections
+nondev_ix <- unlist(map2(c(1, dev_end + 1), c(dev_begin - 1, length(param_txt)), function(i, j) {
+  if (i >= 1 && i < j) {
+    seq(i, j, 1)
+  } else {
+    NULL
+  }
+}))
+
+param_txt2 <- param_txt[nondev_ix]
+
+# strip comments
+param_txt3 <- param_txt2[-grep("^#[^#]", param_txt2)]
+
+# detect groups
+group_ix <- grep("^### ", param_txt3)
+
+out <- map2_dfr(
+  group_ix,
+  c(group_ix[-1] - 1, length(param_txt3)),
+  function(group_start, group_end) {
+    # cat("group_start <- ", group_start, "; group_end <- ", group_end, "\n", sep = "")
+    group_name <- gsub("^### ", "", param_txt3[[group_start]])
+
+    group_txt <- param_txt3[seq(group_start + 1, group_end)]
+
+    arg_ix <- grep("^[^ ]", group_txt)
+
+    arguments <- map2_dfr(
+      arg_ix,
+      c(arg_ix[-1] - 1, length(group_txt)),
+      function(arg_start, arg_end) {
+        # cat("arg_start <- ", arg_start, "; arg_end <- ", arg_end, "\n", sep = "")
+
+        # process name and default
+        first_txt <- group_txt[[arg_start]]
+        first_regex <- "^([^ ]*) +(.*) *$"
+        if (!grepl(first_regex, first_txt)) {
+          stop("Line '", first_txt, "' did not match regex '", first_regex, "'")
+        }
+        name <- gsub(first_regex, "\\1", first_txt)
+        default <- gsub(first_regex, "\\2", first_txt)
+
+        # process type and first description
+        second_txt <- group_txt[[arg_start + 1]]
+        second_regex <- "^   +([^:]*):[ ]+(.*)$"
+        if (!grepl(second_regex, second_txt)) {
+          stop("Line '", second_txt, "' did not match regex '", second_regex, "'")
+        }
+        type <- gsub(second_regex, "\\1", second_txt)
+        desc_start <- str_trim(gsub(second_regex, "\\2", second_txt))
+
+        # process more description
+        desc_cont1 <- group_txt[seq(arg_start + 2, arg_end)]
+
+        desc <-
+          if (sum(str_length(desc_cont1)) == 0) {
+            desc_start
+          } else {
+            # detect margin
+            margins <- str_extract(desc_cont1, "^( +)") %>% na.omit
+            margin <- margins[[which.min(str_length(margins))]]
+            desc_cont2 <- gsub(paste0("^", margin), "", desc_cont1)
+            desc_cont3 <- ifelse(grepl("\\.\\.\\.", desc_cont2), paste0("- ", desc_cont2), desc_cont2)
+            desc_cont4 <- str_trim(desc_cont3)
+
+            # construct desc
+            str_trim(paste0(c(desc_start, "", desc_cont4), "\n", collapse = ""))
+          }
+
+        tibble(
+          group_name,
+          name,
+          default,
+          type,
+          description = desc
+        )
+      }
+    )
+
+    arguments
+  }
+)
+
+# todo: manually fix alignEndsProtrude?
+# assigning types
+type_map <- c("string" = "string", "int" = "integer", "real" = "double", "double" = "double", "int, string" = "string")
+file_args <- c("genomeDir", "readFilesIn", "sjdbGTFfile", "genomeFastaFiles", "genomeChainFiles", "readFilesManifest")
+long_args <- c("limitGenomeGenerateRAM", "limitIObufferSize", "limitOutSAMoneReadBytes", "limitBAMsortRAM")
+required_args <- c("genomeDir", "readFilesIn")
+
+# converting examples
+as_safe_int <- function(x) tryCatch({as.integer(x)}, warning = function(e) { bit64::as.integer64(x) })
+safe_split <- function(x) strsplit(x, "'[^']*'(*SKIP)(*F)|\"[^\"]*\"(*SKIP)(*F)|\\s+", perl = TRUE)[[1]] %>% gsub("^[\"']|[\"']$", "", .)
+trafos <- list(
+  string = function(x) x,
+  integer = as_safe_int,
+  double = as.numeric,
+  strings = function(x) safe_split(x),
+  integers = function(x) sapply(safe_split(x), as_safe_int),
+  doubles = function(x) as.numeric(safe_split(x))
+)
+# remove arguments that are not relevant for viash
+removed_args <- c("versionGenome", "parametersFiles", "sysShell", "runDirPerm")
+# these settings are defined by the viash component
+manual_args <- c("runThreadN", "outTmpDir", "runMode", "outFileNamePrefix", "readFilesIn")
+
+# make viash-like values
+out2 <- out %>%
+  # remove arguments that are not relevant for viash
+  filter(!name %in% c(removed_args, manual_args)) %>%
+  # remove arguments that are related to a different runmode
+  filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description)) %>%
+  filter(!grepl("--runMode", group_name) | grepl("--runMode alignReads", group_name)) %>%
+  filter(!grepl("STARsolo", group_name)) %>%
+  mutate(
+    viash_arg = paste0("--", name),
+    type_step1 = type %>%
+      str_replace_all(".*(int, string|string|int|real|double)\\(?(s?).*", "\\1\\2"),
+    viash_type = type_map[gsub("(int, string|string|int|real|double).*", "\\1", type_step1)],
+    multiple = type_step1 == "int, string" | grepl("s$", type_step1) | grepl("^[4N][\\* ]", type),
+    default_step1 = default %>%
+      {ifelse(. %in% c("-", "None"), NA_character_, .)},
+    viash_default =
+      mapply(
+        default_step1,
+        paste0(viash_type, ifelse(multiple, "s", "")),
+        FUN = function(str, typ) trafos[[typ]](str)
+      ),
+    # viash_type = ifelse(sapply(viash_default, bit64::is.integer64), "long", viash_type),
+    # update type
+    viash_type = case_when(
+      name %in% long_args ~ "long",
+      name %in% file_args ~ "file",
+      TRUE ~ viash_type
+    ),
+    # turn longs into character because yaml::write_yaml doesn't handle longs well
+    viash_default = ifelse(sapply(viash_default, bit64::is.integer64), map(viash_default, as.character), viash_default),
+    group_name = gsub(" - .*", "", group_name),
+    required = ifelse(name %in% required_args, TRUE, NA)
+  )
+print(out2, n = 200)
+out2 %>% mutate(i = row_number()) %>% 
+  # filter(is.na(default_step1) != is.na(viash_default)) %>%
+  select(-group_name, -description)
+
+out2 %>% filter(!grepl("--runMode", description) | grepl("--runMode alignReads", description))
+
+argument_groups <- map(unique(out2$group_name), function(group_name) {
+  args <- out2 %>%
+    filter(group_name == !!group_name) %>%
+    pmap(function(viash_arg, viash_type, multiple, viash_default, description, required, ...) {
+      li <- lst(
+        name = viash_arg,
+        type = viash_type,
+        description = description
+      )
+      if (all(!is.na(viash_default))) {
+        li$example <- viash_default
+      }
+      if (!is.na(multiple) && multiple) {
+        li$multiple <- multiple
+        li$multiple_sep <- ";"
+      }
+      if (!is.na(required) && required) {
+        li$required <- required
+      }
+      li
+    })
+  list(name = group_name, arguments = args)
+})
+
+yaml::write_yaml(list(argument_groups = argument_groups), yaml_file)