md formatting changes in config

emmarousseau · Jul 5, 2024 · b4fe24c · b4fe24c
1 parent ab50e2d
commit b4fe24c
Showing 1 changed file with 71 additions and 88 deletions.
diff --git a/src/umi_tools/umi_tools_dedup/config.vsh.yaml b/src/umi_tools/umi_tools_dedup/config.vsh.yaml
@@ -22,40 +22,37 @@ argument_groups:
       - name: --in_sam
         type: boolean_true
         description: |
-          By default, inputs are assumed to be in BAM format. Use this options 
-          to specify the use of SAM format for input.
+          By default, inputs are assumed to be in BAM format. Use this option to specify the use of SAM
+          format for input.
       - name: --bai
         type: file
         description: BAM index
       - name: --random_seed
         type: integer
-        description: |
-          Random seed to initialize number generator with.
+        description: Random seed to initialize number generator with.
 
   - name: Outputs
     arguments:
       - name: --output
         alternatives: --stdout
         type: file
-        description: Deduplicated BAM file
+        description: Deduplicated BAM file.
         required: true
         direction: output
       - name: --out_sam
         type: boolean_true
         description: |
-          By default, outputa are written in BAM format. Use this options to 
-          specify the use of SAM format for output.
+          By default, outputs are written in BAM format. Use this option to specify the use of SAM format
+          for output.
       - name: --paired
         type: boolean_true
         description: |
-          BAM is paired end - output both read pairs. This will also force the
-          use of the template length to determine reads with the same mapping
-          coordinates.
+          BAM is paired end - output both read pairs. This will also force the use of the template length
+          to determine reads with the same mapping coordinates.
       - name: --output_stats
         type: string
         description: |
-          Generate files containing UMI based deduplication statistics files with this prefix
-          in the file names.
+          Generate files containing UMI-based deduplication statistics, with this prefix in the file names.
       - name: --extract_umi_method
         type: string
         choices: [read_id, tag, umis]
@@ -69,35 +66,30 @@ argument_groups:
       - name: --umi_tag
         type: string
         description: |
-          The tag containing the UMI sequence. 
-          This is only required if the extract_umi_method is set to tag.
+          The tag containing the UMI sequence. This is only required if the extract_umi_method is set to tag.
       - name: --umi_separator
         type: string
         description: |
-          The separator used to separate the UMI from the read sequence. 
-          This is only required if the extract_umi_method is set to id_read. [_]
+          The separator used to separate the UMI from the read sequence. This is only required if the
+          extract_umi_method is set to read_id. Default: `_`.
         example: '_'
       - name: --umi_tag_split
         type: string
-        description: |
-          Separate the UMI in tag by <SPLIT> and take the first element.
+        description: Separate the UMI in tag by <SPLIT> and take the first element.
       - name: --umi_tag_delimiter
         type: string
-        description: |
-          Separate the UMI in by <DELIMITER> and concatenate the elements
+        description: Separate the UMI in tag by <DELIMITER> and concatenate the elements.
       - name: --cell_tag
         type: string
         description: |
-          The tag containing the cell barcode sequence. 
-          This is only required if the extract_umi_method is set to tag.
+          The tag containing the cell barcode sequence. This is only required if the extract_umi_method
+          is set to tag.
       - name: --cell_tag_split
         type: string
-        description: |
-          Separate the cell barcode in tag by <SPLIT> and take the first element.
+        description: Separate the cell barcode in tag by <SPLIT> and take the first element.
       - name: --cell_tag_delimiter
         type: string
-        description: |
-          Separate the cell barcode in by <DELIMITER> and concatenate the elements
+        description: Separate the cell barcode in tag by <DELIMITER> and concatenate the elements.
 
   - name: Grouping Options
     arguments:    
@@ -116,89 +108,80 @@ argument_groups:
       - name: --edit_distance_threshold
         type: integer
         description: |
-          For the adjacency and cluster methods the threshold for the edit 
-          distance to connect two UMIs in the network can be increased. The 
-          default value of 1 works best unless the UMI is very long (>14bp). [1]
+          For the adjacency and cluster methods the threshold for the edit distance to connect two
+          UMIs in the network can be increased. The default value of 1 works best unless the UMI is
+          very long (>14bp). Default: `1`.
         example: 1
       - name: --spliced_is_unique
         type: boolean_true
         description: |
-          Causes two reads that start in the same position on the same strand
-          and having the same UMI to be considered unique if one is spliced 
-          and the other is not. (Uses the 'N' cigar operation to test for splicing).
+          Causes two reads that start in the same position on the same strand and having the same UMI
+          to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation
+          to test for splicing).
       - name: --soft_clip_threshold
         type: integer
         description: |
-          Mappers that soft clip will sometimes do so rather than mapping a
-          spliced read if there is only a small overhang over the exon junction.
-          By setting this option, you can treat reads with at least this many
-          bases soft-clipped at the 3' end as spliced. [4]
+          Mappers that soft clip will sometimes do so rather than mapping a spliced read if there is only
+          a small overhang over the exon junction. By setting this option, you can treat reads with at
+          least this many bases soft-clipped at the 3' end as spliced. Default: `4`.
         example: 4
       - name: --multimapping_detection_method
         type: string
         description: |
-          If the sam/bam contains tags to identify multimapping reads, you can
-          specify for use when selecting the best read at a given loci. Supported
-          tags are "NH", "X0" and "XT". If not specified, the read with the highest
-          mapping quality will be selected.
+          If the sam/bam contains tags to identify multimapping reads, you can specify one to use when selecting
+          the best read at a given locus. Supported tags are `NH`, `X0` and `XT`. If not specified, the read
+          with the highest mapping quality will be selected.
       - name: --read_length
         type: boolean_true
-        description: |
-          Use the read length as a criteria when deduping, for e.g sRNA-Seq.
+        description: Use the read length as a criterion when deduping, e.g. for sRNA-Seq.
 
   - name: Single-cell RNA-Seq Options
     arguments:
       - name: --per_gene
         type: boolean_true
         description: |
-          Reads will be grouped together if they have the same gene. This is useful
-          if your library prep generates PCR duplicates with non identical alignment
-          positions such as CEL-Seq. Note this option is hardcoded to be on with the
-          count command. I.e counting is always performed per-gene. Must be combined
-          with either --gene_tag or --per_contig option.
+          Reads will be grouped together if they have the same gene. This is useful if your library prep
+          generates PCR duplicates with non identical alignment positions such as CEL-Seq. Note this option
+          is hardcoded to be on with the count command. I.e. counting is always performed per-gene. Must be
+          combined with either --gene_tag or --per_contig option.
       - name: --gene_tag
         type: string
         description: |
-          Deduplicate per gene. The gene information is encoded in the bam read tag
-          specified.
+          Deduplicate per gene. The gene information is encoded in the bam read tag specified.
       - name: --assigned_status_tag
         type: string
         description: |
-          BAM tag which describes whether a read is assigned to a gene. Defaults to
-          the same value as given for --gene_tag.
+          BAM tag which describes whether a read is assigned to a gene. Defaults to the same value as given
+          for --gene_tag.
       - name: --skip_tags_regex
         type: string
         description: |
-          Use in conjunction with the --assigned_status_tag option to skip any reads
-          where the tag matches this regex. Default ("^[__|Unassigned]") matches
-          anything which starts with "__" or "Unassigned".
+          Use in conjunction with the --assigned_status_tag option to skip any reads where the tag matches
+          this regex. Default ("^[__|Unassigned]") matches anything which starts with "__" or "Unassigned".
       - name: --per_contig
         type: boolean_true
         description: |
-          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same
-          contig will be considered to have the same alignment position. This is
-          useful if you have aligned to a reference transcriptome with one
-          transcript per gene. If you have aligned to a transcriptome with more
-          than one transcript per gene, you can supply a map between transcripts
-          and gene using the --gene_transcript_map option.
+          Deduplicate per contig (field 3 in BAM; RNAME). All reads with the same contig will be considered to
+          have the same alignment position. This is useful if you have aligned to a reference transcriptome
+          with one transcript per gene. If you have aligned to a transcriptome with more than one transcript
+          per gene, you can supply a map between transcripts and gene using the --gene_transcript_map option.
       - name: --gene_transcript_map
         type: file
         description: |
-          A file containing a mapping between gene names and transcript names.
-          The file should be tab separated with the gene name in the first column
-          and the transcript name in the second column.
+          A file containing a mapping between gene names and transcript names. The file should be tab
+          separated with the gene name in the first column and the transcript name in the second column.
       - name: --per_cell
         type: boolean_true
         description: |
-          Reads will only be grouped together if they have the same cell barcode.
-          Can be combined with --per_gene.
+          Reads will only be grouped together if they have the same cell barcode. Can be combined with
+          --per_gene.
   
   - name: SAM/BAM Options
     arguments:
       - name: --mapping_quality
         type: integer
         description: |
-          Minimium mapping quality (MAPQ) for a read to be retained. [0]
+          Minimum mapping quality (MAPQ) for a read to be retained. Default: `0`.
         example: 0
       - name: --unmapped_reads
         type: string
@@ -217,7 +200,8 @@ argument_groups:
           The options are:
             * "discard": Discard all chimeric read pairs.
             * "use":     Deduplicate using read1 only. (default)
-            * "output":  Output chimeric pairs without UMI grouping/deduplication. Only available in umi_tools group.
+            * "output":  Output chimeric pairs without UMI grouping/deduplication. Only available in
+                         umi_tools group.
         example: "use"
       - name: --unpaired_reads
         type: string
@@ -227,42 +211,38 @@ argument_groups:
           The options are: 
             * "discard": Discard all unmapped reads.
             * "use": If read2 is unmapped, deduplicate using read1 only. Requires --paired. (default)
-            * "output":  Output unmapped reads/read pairs without UMI grouping/deduplication. Only available in umi_tools group.
+            * "output":  Output unmapped reads/read pairs without UMI grouping/deduplication. Only available
+                         in umi_tools group.
         example: "use"
       - name: --ignore_umi
         type: boolean_true
-        description: |
-          Ignore the UMI and group reads using mapping coordinates only.
+        description: Ignore the UMI and group reads using mapping coordinates only.
       - name: --subset
         type: double
         description: |
-          Only consider a fraction of the reads, chosen at random. This is useful
-          for doing saturation analyses.
+          Only consider a fraction of the reads, chosen at random. This is useful for doing saturation
+          analyses.
       - name: --chrom
         type: string
-        description: |
-          Only consider a single chromosome. This is useful for debugging/testing 
-          purposes.
+        description: Only consider a single chromosome. This is useful for debugging/testing purposes.
 
   - name: Group/Dedup Options
     arguments:
       - name: --no_sort_output
         type: boolean_true
         description: |
-          By default, output is sorted. This involves the use of a temporary unsorted
-          file (saved in --temp_dir). Use this option to turn off sorting.
+          By default, output is sorted. This involves the use of a temporary unsorted file (saved in
+          --temp_dir). Use this option to turn off sorting.
       - name: --buffer_whole_contig
         type: boolean_true
         description: |
-          Forces dedup to parse an entire contig before yielding any reads for
-          deduplication. This is the only way to absolutely guarantee that all reads
-          with the same start position are grouped together for deduplication since
-          dedup uses the start position of the read, not the alignment coordinate on
-          which the reads are sorted. However, by default, dedup reads for another
-          1000bp before outputting read groups which will avoid any reads being missed
-          with short read sequencing (<1000bp).
+          Forces dedup to parse an entire contig before yielding any reads for deduplication. This is the
+          only way to absolutely guarantee that all reads with the same start position are grouped together
+          for deduplication since dedup uses the start position of the read, not the alignment coordinate on
+          which the reads are sorted. However, by default, dedup reads for another 1000bp before outputting
+          read groups which will avoid any reads being missed with short read sequencing (<1000bp).
   
-  - name: Common UMI-tools Options
+  - name: Common Options
     arguments:
       - name: --log
         alternatives: -L
@@ -274,7 +254,8 @@ argument_groups:
       - name: --verbose
         alternatives: -v
         type: integer
-        description: Log level. The higher, the more output. [0]
+        description: |
+          Log level. The higher, the more output. Default: `0`.
         example: 0
       - name: --error
         alternatives: -E
@@ -287,14 +268,16 @@ argument_groups:
       - name: --compresslevel
         type: integer
         description: |
-          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default. [6]
+          Level of Gzip compression to use. Default=6 matches GNU gzip rather than python gzip default.
+          Default: `6`.
         example: 6
       - name: --timeit
         type: file
         description: Store timing information in file.
       - name: --timeit_name
         type: string
-        description: Name in timing file for this class of jobs. [all]
+        description: |
+          Name in timing file for this class of jobs. Default: `all`.
         example: "all"
       - name: --timeit_header
         type: string