Merge pull request #57 from quinlan-lab/dev-merge

Candidate for release v0.2.0 - fixed merge conflicts
quinlan-lab · May 5, 2020 · f09fbdc · f09fbdc
2 parents 614a488 + fc73fe9
commit f09fbdc
Show file tree

Hide file tree

Showing 22 changed files with 443 additions and 206 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,10 @@
 str
 tests/test_utils
 tests/test_strling
+src/strpkg/spanning
 tests/all
+src/strpkg/unpack
+src/strpkg/utils
+src/strpkg/simulate_reads
+src/strling
+*.swp
diff --git a/README.md b/README.md
@@ -8,103 +8,8 @@ STRling is still in development. Please report bugs via GitHub issues.
 
 STRling (pronounced like “sterling”) is a method to detect large STR expansions from short-read sequencing data. It is capable of detecting novel STR expansions, that is expansions where there is no STR in the reference genome at that position (or a different repeat unit from what is in the reference). It can also detect STR expansions that are annotated in the reference genome. STRling uses kmer counting to recover mis-mapped STR reads. It then uses soft-clipped reads to precisely discover the position of the STR expansion in the reference genome.
 
-## Install
-
-We recommending downloading the static binary.
-
-Download `strling` from the latest release from [here](https://github.com/quinlan-lab/STRling/releases/latest).
-
-Make it executable:  
-`chmod +x strling`
-
-### Install from source
-
-Install nim:  
-`curl https://nim-lang.org/choosenim/init.sh -sSf > init.sh && sh init.sh`
-
-Install STRling:  
-```
-git clone <URL>
-cd STRling
-nimble install
-```
-
-Compile options for development:  
-
-Compile in fast mode (danger):  
-`nim c -d:danger -d:release src/strling.nim`
-
-## Run
-
-### Single sample
-
-#### extract informative pairs to a binary format
-```
-name=hg002
-strling extract -f $reference_fasta /path/to/$name.cram $name.bin
-```
-
-#### call strs on the extract binary data
-
-```
-mkdir -p str-results/
-strling call --output-prefix str-results/$name -f $reference_fasta /path/to/$name.cram $name.bin
-```
-
-### Joint calling
-
-#### extract informative pairs to a binary format for a single sample (same as above)
-```
-strling extract -f $reference_fasta /path/to/$sample2.cram $sample1.bin
-strling extract -f $reference_fasta /path/to/$sample2.cram $sample2.bin
-```
-
-#### joint call call str loci accross all samples
-
-```
-mkdir -p str-results/
-strling merge --output-prefix str-results/joint -f $reference_fasta $sample1.bin $sample2.nim
-```
-
-#### call strs on a single sample
-
-```
-strling call --output-prefix str-results/$sample1 -b str-results/joint-bounds.txt -f $reference_fasta /path/to/$sample1.cram $sample1.bin
-strling call --output-prefix str-results/$sample2 -b str-results/joint-bounds.txt -f $reference_fasta /path/to/$sample2.cram $sample2.bin
-```
-
-## Outputs
-
-The main output file is `{$prefix}-genotype.txt`. It reports all STR expansion loci that pass thresholds as well as any provided as input. The columns are:
-- chrom: chromosome/contig name
-- left: predicted left boundry of STR locus
-- right: predicted right boundry of STR locus
-- repeatunit: predicted STR repeat unit
-- allele1\_est: estimated size of the shorter allele in repeat units relative to the reference, from spanning reads (if any). "na" indicats no reads support an allele shorter than the read length, so both may be large.
-- allele2\_est: estimated size of the larger allele in repeat units relative to the reference, from anchored reads
-- total\_reads: number of reads supporting an expansion at this locus
-- spanning\_reads: number of reads that span the locus
-- spanning\_pairs: number of read pairs that span the locus
-- left\_clips: number of soft-clipped reads supporting the left side of the locus position
-- right\_clips: number of soft-clipped reads supporting the right side of the locus position
-- unplaced\_pairs: number of unplaced STR reads assigned to this locus (will only be >0 for a uniquely expanded repeat unit)
-- depth: local median depth around the locus
-- sum\_str\_counts: the sum of STR repeat units in all reads supporting an expansion
-
-Some additional outputs are provided with detailed supporting evidence used to make the genotype calls:
-- Putative str bounds: `{$prefix}-bounds.txt`
-- All str-like reads: `{$prefix}-reads.txt`
-- Spanning reads and spanning pairs:`{$prefix}-spanning.txt`
-- Counts of str-like reads that are unplaced (could not be assigned to a locus): `{$prefix}-unplaced.txt`
-
-
-## Run tests
-`nimble tests`
-
-If you get the error:
-`could not load: libhts.so`
-
-try:  
-`export LD_LIBRARY_PATH=./htslib/`
+## Install and Run STRling
 
+Download the latest release of `strling` from [here](https://github.com/quinlan-lab/STRling/releases/latest).
 
+Please see the [STRling Documentation](https://strling.readthedocs.io/en/latest/) for running instructions.
diff --git a/docs/source/run.rst b/docs/source/run.rst
@@ -26,21 +26,21 @@ Call strs on the extract binary data
 Joint calling
 -------------
 
-Extract informative pairs to a binary format for a single sample (same as above)
+Extract informative pairs to a binary format for a single sample (same as above, you can use the same bin files)
 
 .. code-block:: bash
 
   strling extract -f $reference_fasta /path/to/$sample2.cram $sample1.bin
   strling extract -f $reference_fasta /path/to/$sample2.cram $sample2.bin
 
-Joint call call str loci accross all samples
+Joint call str loci across all samples
 
 .. code-block:: bash
 
   mkdir -p str-results/
-  strling merge --output-prefix str-results/joint -f $reference_fasta $sample1.bin $sample2.nim
+  strling merge --output-prefix str-results/joint -f $reference_fasta $sample1.bin $sample2.bin
 
-Call strs on a single sample
+Call estimate allele sizes for each sample
 
 .. code-block:: bash
 

diff --git a/pipelines/README.md b/pipelines/README.md
@@ -0,0 +1,11 @@
+# Bpipe pipelines
+
+Place the strling executable in your PATH, or set `-p STRLING=/path/to/strling`
+
+## Individual calling
+`bg-bpipe run -p REF=/path/to/ref-genome.fasta strling-individual.groovy *.cram`
+
+## Joint calling
+`bg-bpipe run -p REF=/path/to/ref-genome.fasta strling-joint.groovy *.cram`
+
+Note that nim does not play nicely with bpipe's ctl-C handing, so have to use the background bpipe command `bg-bpipe` instead of the regular `bpipe`.
diff --git a/pipelines/bpipe.config b/pipelines/bpipe.config
@@ -0,0 +1,17 @@
+//executor="slurm" 
+executor="local" 
+
+procs="1" 
+walltime="24:00:00"
+memory="16"
+
+commands {
+    local {
+        executor="local" 
+    }
+    strling {
+        walltime="04:00:00"
+        memory="8"
+    }
+
+}
diff --git a/pipelines/pipeline-stages.groovy b/pipelines/pipeline-stages.groovy
@@ -0,0 +1,67 @@
+STRLING='strling'
+
+if(args.any { it.endsWith('.cram') })
+    input_type = 'cram'
+else
+    input_type='bam'
+
+def get_fname(path) {
+    def x = path.split("/")[-1]
+    return(x)
+}
+
+str_extract = {
+    def sample = branch.name
+    def str_ref = get_fname(REF) + ".str"
+    produce(sample + ".str.bin") {
+        exec """
+            $STRLING extract
+                -f $REF
+                -g $str_ref
+                ${input[input_type]}
+                $output.bin
+        ""","strling"
+    }
+}
+
+str_merge = {
+    from("*.bin") produce("strling-bounds.txt") {
+        exec """
+            $STRLING merge
+                -f $REF
+                $inputs.bin
+        ""","strling"
+    }
+}
+
+str_call_individual = {
+    def sample = branch.name.prefix
+    from (sample + "*str.bin", sample + "*" + input_type) produce(
+            sample + "-bounds.txt", sample + "-unplaced.txt",
+            sample + "-genotype.txt") {
+        exec """
+            $STRLING call
+                -f $REF
+                -o $sample
+                ${input[input_type]}
+                $input.bin
+        ""","strling"
+    }
+}
+
+str_call_joint = {
+    def sample = branch.name.prefix
+    from (sample + "*str.bin", sample + "*" + input_type, "strling-bounds.txt") produce(
+            sample + "-bounds.txt", sample + "-unplaced.txt",
+            sample + "-genotype.txt") {
+        exec """
+            $STRLING call
+                -f $REF
+                -b $input.txt
+                -o $sample
+                ${input[input_type]}
+                $input.bin
+        ""","strling"
+    }
+}
+
diff --git a/pipelines/strling-individual.groovy b/pipelines/strling-individual.groovy
@@ -0,0 +1,9 @@
+// Load system configuration and other settings
+//load 'pipeline-config.groovy'
+
+// Load Bpipe pipeline stages
+load 'pipeline-stages.groovy'
+
+run {
+    "%.${input_type}" * [str_extract + str_call_individual]
+}
diff --git a/pipelines/strling-joint.groovy b/pipelines/strling-joint.groovy
@@ -0,0 +1,11 @@
+// Load system configuration and other settings
+//load 'pipeline-config.groovy'
+
+// Load Bpipe pipeline stages
+load 'pipeline-stages.groovy'
+
+run {
+    "%.${input_type}" * [str_extract] +
+        str_merge +
+    "%.bin" * [str_call_joint] //+ str_outlier
+}
diff --git a/sim/combine_random_sim_results.py b/sim/combine_random_sim_results.py
@@ -99,27 +99,25 @@ def main():
     parse_bounds(boundsfiles).to_csv(args.out + '-bounds.csv', index=False)
 
     spansfiles = glob.glob(args.str_dir+"/*-spanning.txt")
-    if len(spansfiles) == 0:
-        sys.exit('ERROR: No -spanning.txt files found in the given directory: ' + args.str_dir)
-    parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False)
+    if len(spansfiles) > 0:
+        parse_spans(spansfiles).to_csv(args.out + '-spans.csv', index=False)
 
     unplacedfiles = glob.glob(args.str_dir+"/*-unplaced.txt")
     if len(unplacedfiles) == 0:
         sys.exit('ERROR: No -unplaced.txt files found in the given directory: ' + args.str_dir)
     parse_unplaced(unplacedfiles).to_csv(args.out + '-unplaced.csv', index=False)
 
     readsfiles = glob.glob(args.str_dir+"/*-reads.txt")
-    if len(readsfiles) == 0:
-        sys.exit('ERROR: No -reads.txt files found in the given directory: ' + args.str_dir)
-    all_df_reads = []
-    for f in readsfiles:
-        df_reads = pd.read_csv(f, sep='\t')
-        df_reads.rename(columns={"#chrom": "chrom"}, inplace=True)
-        df_reads['sim'] = get_sim_str(f)
-        all_df_reads.append(df_reads)
-    all_df = pd.concat(all_df_reads)
-    all_df = all_df.merge(bed_df, how='left', on='sim')
-    all_df.to_csv(args.out + '-results.csv', index=False)
+    if len(readsfiles) > 0:
+        all_df_reads = []
+        for f in readsfiles:
+            df_reads = pd.read_csv(f, sep='\t')
+            df_reads.rename(columns={"#chrom": "chrom"}, inplace=True)
+            df_reads['sim'] = get_sim_str(f)
+            all_df_reads.append(df_reads)
+        all_df = pd.concat(all_df_reads)
+        all_df = all_df.merge(bed_df, how='left', on='sim')
+        all_df.to_csv(args.out + '-results.csv', index=False)
 
 if __name__ == "__main__":
     main()
diff --git a/sim/sim_shared.groovy b/sim/sim_shared.groovy
@@ -146,8 +146,7 @@ str_call = {
     def sample = branch.name.prefix
 
     from (sample + ".str.bin", sample + ".bam", "strling-bounds.txt") produce(
-            sample + "-reads.txt", sample + "-bounds.txt",
-            sample + "-spanning.txt", sample + "-unplaced.txt",
+            sample + "-bounds.txt", sample + "-unplaced.txt",
             sample + "-genotype.txt") {
 
         def bamname = get_fname(input.bam)

diff --git a/sim/simulate_random.groovy b/sim/simulate_random.groovy
@@ -13,11 +13,11 @@ mutate_locus = {
     produce('*.bed') {
         exec """
             $PYTHON $TOOLS/random_str_alleles.py
-                --locus "4   3076604 3076695 CAG"
+                --locus "7 117143769 117143769 CAG"
                 --out $dir/
-                --num 100
-                --min -5
-                --max 200
+                --num 300
+                --min 0
+                --max 600
                 --fixed 0
                 --seed 7
         """
@@ -36,15 +36,15 @@ simulate_str_reads = {
         ~/storage/git/STRling/src/strpkg/simulate_reads
             --fasta $REF
             --output $output.prefix
-            $input.hist
+            $input.cram
             $input.bed
     """
 }
 
 combine = {
 
         exec """
-            $PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out HTT
+            $PYTHON $TOOLS/combine_random_sim_results.py --bed_dir sim --str_dir str --out sim
         """
 }
 

diff --git a/src/strling.nim b/src/strling.nim
@@ -6,6 +6,7 @@ import ./strpkg/extract
 import ./strpkg/merge
 import ./strpkg/call
 import ./strpkg/version
+import ./strpkg/extract_region
 
 proc main*() =
 
@@ -17,6 +18,7 @@ proc main*() =
     "extract": pair(f:extract_main, description:"extract informative STR reads from a BAM/CRAM. This is a required first step."),
     "merge": pair(f:merge_main, description:"merge putitive STR loci from multiple samples. Only required for joint calling."),
     "call": pair(f:call_main, description:"call STRs"),
+    "pull_region": pair(f:extract_region_main, description:"for debugging; pull all reads (and mates) for a given regions"),
     }.toOrderedTable
   var args = commandLineParams()