🔥 first awesome commit

papaemmelab · Mar 6, 2019 · 56f7b30 · 56f7b30
commit 56f7b30
Show file tree

Hide file tree

Showing 4 changed files with 315 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Max Levine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,60 @@
+Nim Program to split bed files based on estimated read counts in each interval based on the index of a bam file.
+
+This program splits based on read estimates without counting the reads.
+If more exact read counts are need use [split_bed_by_reads](https://github.com/papaemmelab/split_bed_by_reads).
+
+This is useful for parallelization of bioinformatics algorithms that allow input regions.
+When runtime of the algorithms is linearly correlated with number of reads in the region,
+this program should create regions with similar runtime.
+
+# How It Works
+
+Bam indexs (bai files) store information on file byte offsets for different regions of the bam.
+This allows for random access of reads from the bam file. The linear index is split up into 16,384 base chunks.
+It also stores the total number of mapped and unmapped reads for each contig. Using the total number of reads and bytes for a contig we can estimate the average number of bytes per read.
+The bytes per read value is then used to estimate the number of reads in the 16,384 base chunks overlapped by the requested regions and split based on a maximum.
+
+# Installation:
+
+Install nim and compile nim script
+
+```
+curl https://nim-lang.org/choosenim/init.sh -sSf|sed "s/need_tty=yes/need_tty=no/g" | sh
+export PATH=$HOME/.nimble/bin:$PATH
+
+export PATH=${OPT_DIR}/.nimble/bin:$PATH
+nimble install -y https://github.com/papaemmelab/split_bed_by_index --nimbleDir:${OPT_DIR}/.nimble
+```
+
+Download the binaries:
+
+```
+wget -O ${BIN_DIR}/split_bed_by_index https://github.com/papaemmelab/split_bed_by_index/releases/download/0.1.0/split_bed_by_index && \
+chmod +x ${BIN_DIR}/split_bed_by_index
+export PATH=${BIN_DIR}:$PATH
+```
+
+# Usage:
+
+```
+Split bed intervals by estimated read counts from bam index
+
+Usage:
+  split_bed_by_index [options] <BED> <BAM> <OUT>
+
+Arguments:
+  <BED>     Input bed file
+  <BAM>     Input bam file
+  <OUT>     Output bed file
+
+Options:
+  -h --help                 Show this screen.
+  -v --version              Show version.
+  -c --count <count>        Number of reads to split  [default: 1000000].
+  -m --merge                Merge overlapping regions before split.
+```
+
+# Credit
+* Max Levine [@mflevine](https://github.com/mflevine)
+* Inspired by [indexcov](https://github.com/brentp/goleft/tree/master/indexcov) by Brent Pederson [@brentp](https://github.com/brentp)
+* [hts-nim-tools](https://github.com/brentp/hts-nim-tools) by Brent Pederson [@brentp](https://github.com/brentp)
diff --git a/split_bed_by_reads.nimble b/split_bed_by_reads.nimble
@@ -0,0 +1,13 @@
+# Package
+
+version       = "0.1.0"
+author        = "Max Levine"
+description   = "Split bed by bai estimated read counts command-line tool"
+license       = "MIT"
+srcDir        = "src"
+bin           = @["split_bed_by_index"]
+
+
+# Dependencies
+
+requires "nim >= 0.19.0", "hts", "docopt"
diff --git a/src/split_bed_by_index.nim b/src/split_bed_by_index.nim
@@ -0,0 +1,221 @@
+let doc = """
+Split bed intervals by estimated read counts from bam index
+
+Usage:
+  split_bed_by_index [options] <BED> <BAM> <OUT>
+
+Arguments:
+  <BED>     Input bed file
+  <BAM>     Input bam file
+  <OUT>     Output bed file
+
+Options:
+  -h --help                 Show this screen.
+  -v --version              Show version.
+  -c --count <count>        Number of reads to split  [default: 1000000].
+  -m --merge                Merge overlapping regions before split.
+"""
+
+import algorithm
+import docopt
+import hts
+import math
+import os
+import sequtils
+import streams
+import strutils
+import tables
+import times
+
+const chunk_size = 16_384
+
+proc `/`(s1: openArray[int|float], s2: openArray[int|float]): seq[float] =
+    let len = s1.len
+    assert len == s2.len
+    newSeq(result, len)
+    for i in 0 ..< len:
+        if s2[i] != 0:
+            result[i] = s1[i] / s2[i]
+        else:
+            result[i] = 0
+
+proc `/`(s: openArray[int|float], v: int|float): seq[float] =
+    assert v != 0
+    let d = float(v)
+    newSeq(result, s.len)
+    for i in 0 ..< s.len:
+        result[i] = float(s[i]) / d
+
+type
+    region_t = ref object
+        chrom: string
+        start: int
+        stop: int
+        name: string
+        count: int
+
+# from https://github.com/brentp/hts-nim-tools
+proc bed_line_to_region(line: string): region_t =
+    var
+        cse = line.strip().split('\t', 5)
+
+    if len(cse) < 3:
+        stderr.write_line("skipping bad bed line:", line.strip())
+        return nil
+    var
+        s = parse_int(cse[1])
+        e = parse_int(cse[2])
+        reg = region_t(chrom: cse[0], start: s, stop: e, count:0)
+    if len(cse) > 3:
+        reg.name = cse[3]
+    return reg
+
+# from https://github.com/brentp/hts-nim-tools
+proc bed_to_table(bed: string): TableRef[string, seq[region_t]] =
+    var bed_regions = newTable[string, seq[region_t]]()
+    var hf = hts.hts_open(cstring(bed), "r")
+    var kstr: hts.kstring_t
+    kstr.l = 0
+    kstr.m = 0
+    kstr.s = nil
+    while hts_getline(hf, cint(10), addr kstr) > 0:
+        if ($kstr.s).startswith("track "):
+            continue
+        if $kstr.s[0] == "#":
+            continue
+        var v = bed_line_to_region($kstr.s)
+        if v == nil: continue
+        discard bed_regions.hasKeyOrPut(v.chrom, new_seq[region_t]())
+        bed_regions[v.chrom].add(v)
+
+    # since it is read into mem, can also well sort.
+    for chrom, ivs in bed_regions.mpairs:
+        sort(ivs, proc (a, b: region_t): int = a.start - b.start)
+
+    hts.free(kstr.s)
+    return bed_regions
+
+proc readIndex(fs: FileStream, sizes: var seq[seq[int]], total_bytes: var seq[int], total_reads: var seq[int]) =
+    # read index and set sizes, total_bytes, and total_reads
+    discard fs.readStr(4)
+    var n_ref = fs.readInt32()
+    for chr in 0..<n_ref:
+        sizes.add(newSeq[int]())
+        var n_bin = fs.readInt32()
+        for bins in 0..<n_bin:
+            var bin = fs.readUint32()
+            var n_chunk = fs.readInt32()
+            for chunk in 0..<n_chunk:
+                var chunk_beg = fs.readUint64()
+                var chunk_end = fs.readUint64()
+                if bin == 37450 and chunk == 1:
+                    total_reads.add(int(chunk_beg) + int(chunk_end))
+        if total_reads.len < chr+1:
+            total_reads.add(0)
+        var n_intv = fs.readInt32()
+        var offset = 0
+        for intv in 0..<n_intv:
+            var ioffset = fs.readUint64()
+            if offset != 0:
+                sizes[chr].add(int(ioffset) - int(offset))
+            offset = int(ioffset)
+        total_bytes.add(sum(sizes[chr]))
+    var n_no_coor = fs.readUint64()
+    fs.close()
+
+proc splitRegion(chrom: string, contig: int, start_pos: int, end_pos: int, sizes: seq[seq[int]], count: int, bytes_per_read: float, out_bed: File) =
+    # split a region and write to file
+    var pos = start_pos
+    var counter = 0.0
+    let start_chunk = int(start_pos / chunk_size)
+    for i,chunk in sizes[contig][int(start_pos / chunk_size)..int(end_pos / chunk_size)]:
+        let chunk_ind = int(start_pos / chunk_size) + i
+        let est_reads = float(chunk)/bytes_per_read
+        counter += est_reads
+        if counter > float(count):
+            echo chrom & ":" & $pos & "-" & $((chunk_ind+1)*chunk_size - 1) & "\t" & $counter
+            out_bed.writeLine(chrom & "\t" & $pos & "\t" & $((chunk_ind+1)*chunk_size - 1) & "\t" & $counter)
+            pos = (chunk_ind+1)*chunk_size
+            counter = 0.0
+    echo chrom & ":" & $pos & "-" & $end_pos & "\t" & $counter
+    out_bed.writeLine(chrom & "\t" & $pos & "\t" & $end_pos & "\t" & $counter)
+
+proc mergeRegions(regions: TableRef[string, seq[region_t]]): TableRef[string, seq[region_t]] =
+    # adapted from https://www.geeksforgeeks.org/merging-intervals/
+    # we already know the structure is sorted
+    result = newTable[string, seq[region_t]]()
+    for chrom, intervals in regions.pairs():
+        # put first interval in stack
+        result[chrom] = @[intervals[0]]
+        if intervals.len > 1:
+            # start from next interval and merge if needed
+            for interval in intervals[1..intervals.len-1]:
+                # get last element
+                let top = result[chrom][result[chrom].len-1]
+                # if not overlapping add to stack
+                if top.stop < interval.start:
+                    result[chrom].add(interval)
+                # else update stop if current greater than top
+                elif top.stop < interval.stop:
+                    top.stop = interval.stop
+                    discard result[chrom].pop()
+                    result[chrom].add(top)
+
+
+proc main() =
+    let args = docopt(doc, version = "0.1.0")
+    echo "BED: ", $args["<BED>"]
+    echo "BAM: ", $args["<BAM>"]
+    echo "OUT: ", $args["<OUT>"]
+    echo "Count: ", $args["--count"], " reads"
+    echo ""
+
+    var
+        bam:Bam
+        sizes = newSeq[seq[int]]() # matrix of bytes per chunk per contig
+        total_bytes = newSeq[int]() # bytes per contig
+        total_reads = newSeq[int]() # reads per contig
+        regions = bed_to_table($args["<BED>"])
+        idx_path: string
+
+    let
+        time = now()
+        split = parseInt($args["--count"])
+        out_bed = open($args["<OUT>"], fmWrite)
+        bam_path = $args["<BAM>"]
+
+    open(bam, bampath, index=true)
+
+    # find index
+    if os.fileExists(bam_path & ".bai"):
+        idx_path = bam_path & ".bai"
+    elif os.fileExists(bam_path.replace(".bam",".bai")):
+        idx_path = bam_path.replace(".bam",".bai")
+    else:
+        raise newException(ValueError, "Bam index could not be found")
+
+    # resolve overlaps
+    if args["--merge"]:
+        regions = mergeRegions(regions)
+
+    # read index
+    let idx = newFileStream(idx_path, fmRead)
+    readIndex(idx, sizes, total_bytes, total_reads)
+
+    # calculate average bytes per read
+    let bytes_per_read = sum(total_bytes/total_reads)/float((total_bytes/total_reads).len)
+
+    # loop through regions and split
+    for target in targets(bam.hdr):
+        var chrom = target.name
+        if not regions.contains(chrom) or regions[chrom].len == 0:
+            continue
+        for region in regions[chrom]:
+            splitRegion(chrom, target.tid, region.start, region.stop, sizes, split, bytes_per_read, out_bed)
+    echo ""
+    echo "Elapsed time: " & $(now() - time)
+    close(bam)
+    close(out_bed)
+
+when isMainModule:
+  main()