diff --git a/.gitignore b/.gitignore index 1e830bf..c95bf93 100755 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ examples/ hdf5-1.8.14/ tensorflow/ hdf5-1.8.14.tar.gz +htslib/ #generated .depend @@ -26,8 +27,9 @@ hdf5-1.8.14.tar.gz .pydevproject .settings/* -#commit log file +#commit log file and path src/gitcommit.h +src/softwarepath.h #plots *.png diff --git a/Makefile b/Makefile index 7b731a1..eea974c 100755 --- a/Makefile +++ b/Makefile @@ -1,30 +1,43 @@ CC = gcc CXX = g++ DEBUG = -g -LIBFLAGS = +LIBFLAGS = -lrt +LDFLAGS ?= -ldl -llzma -lbz2 -lm -lz CXXFLAGS = -Wall -O2 -fopenmp -std=c++14 CFLAGS = -Wall -std=c99 -O2 +SPACE:= ; +SPACE+=; +null := +space := ${null} ${null} +${space} := ${space} + +CURRENT_PATH := $(subst $(lastword $(notdir $(MAKEFILE_LIST))),,$(subst $(SPACE),\$(SPACE),$(shell realpath '$(strip $(MAKEFILE_LIST))'))) +PATH_SPACEFIX := $(subst ${space},\${space},${CURRENT_PATH}) + +ifeq ($(zstd),1) + LDFLAGS += -lzstd +endif + #hdf5 H5_LIB = ./hdf5-1.8.14/hdf5/lib/libhdf5.a H5_INCLUDE = -I./hdf5-1.8.14/hdf5/include -LIBFLAGS += -Wl,-rpath,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))hdf5-1.8.14/hdf5/lib -L hdf5-1.8.14/hdf5/lib -lhdf5 #hts HTS_LIB = ./htslib/libhts.a HTS_INCLUDE = -I./htslib -LIBFLAGS += -Wl,-rpath,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))htslib -L htslib/ -lhts #tensorflow -TENS_LIB = ./tensorflow/include/tensorflow/c/c_api.h +TENS_DEPEND = tensorflow/include/tensorflow/c/c_api.h +TENS_LIB = -Wl,-rpath,${PATH_SPACEFIX}tensorflow/lib -L tensorflow/lib TENS_INCLUDE = -I./tensorflow/include -LIBFLAGS += -Wl,-rpath,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))tensorflow/lib -L tensorflow/lib -ltensorflow +LIBFLAGS = -ltensorflow #fast5 FAST5_INCLUDE = -I./fast5/include #add include flags for each library -CXXFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(TENS_INCLUDE) +CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(TENS_INCLUDE) MAIN_EXECUTABLE = bin/DNAscent @@ -47,12 +60,12 @@ 
tensorflow/include/tensorflow/c/c_api.h: if [ ! -e tensorflow/include/tensorflow/c/c_api.h ]; then \ mkdir tensorflow; \ cd tensorflow; \ - wget https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.15.0.tar.gz; \ - tar -xzf libtensorflow-gpu-linux-x86_64-1.15.0.tar.gz || exit 255; \ + wget https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-2.4.1.tar.gz; \ + tar -xzf libtensorflow-gpu-linux-x86_64-2.4.1.tar.gz || exit 255; \ cd ..; \ fi -SUBDIRS = src src/scrappie src/pfasta +SUBDIRS = src src/scrappie src/pfasta src/sgsmooth CPP_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cpp)) C_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.c)) EXE_SRC = src/DNAscent.cpp @@ -61,27 +74,31 @@ EXE_SRC = src/DNAscent.cpp src/gitcommit.h: .git/HEAD .git/index echo "const char *gitcommit = \"$(shell git rev-parse HEAD)\";" > $@ +#log the software path +src/softwarepath.h: + echo "const char *executablePath = \"${PATH_SPACEFIX}\";" > $@ + #generate object names CPP_OBJ = $(CPP_SRC:.cpp=.o) C_OBJ = $(C_SRC:.c=.o) depend: .depend -.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(TENS_LIB) src/gitcommit.h +.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(TENS_DEPEND) src/gitcommit.h src/softwarepath.h rm -f ./.depend - $(CXX) $(CXXFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; + $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend; #compile each object -.cpp.o: src/gitcommit.h - $(CXX) -o $@ -c $(CXXFLAGS) -fPIC $< +.cpp.o: src/gitcommit.h src/softwarepath.h + $(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $< .c.o: - $(CC) -o $@ -c $(CFLAGS) $(H5_INCLUDE) -fPIC $< + $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $< #compile the main executable -$(MAIN_EXECUTABLE): src/DNAscent.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(TENS_LIB) src/gitcommit.h - $(CXX) -o $@ $(CXXFLAGS) -fPIC $(CPP_OBJ) $(C_OBJ) $(LIBFLAGS) +$(MAIN_EXECUTABLE): src/DNAscent.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) 
$(H5_LIB) $(TENS_DEPEND) src/gitcommit.h src/softwarepath.h + $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(TENS_LIB) $(LIBFLAGS) $(LDFLAGS) clean: - rm -f $(MAIN_EXECUTABLE) $(CPP_OBJ) $(C_OBJ) src/DNAscent.o gitcommit.h + rm -f $(MAIN_EXECUTABLE) $(CPP_OBJ) $(C_OBJ) src/DNAscent.o src/gitcommit.h src/softwarepath.h diff --git a/README.md b/README.md index d04cedb..95ec1aa 100755 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ git clone --recursive https://github.com/MBoemo/DNAscent.git The DNAscent directory will appear in your current directory. Switch to the latest tagged version and compile the software by running: ```shell cd DNAscent -git checkout 2.0.2 +git checkout 3.0.2 make ``` This will put the DNAscent executable into the DNAscent/bin directory. A typical compile time for DNAscent and its dependencies is 5 minutes. @@ -21,6 +21,7 @@ Please see the [documentation](https://dnascent.readthedocs.io) for detailed usa ## Citation Please cite the following if you use DNAscent for your research: +- Totanes FIG, Gockel J, Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. bioRxiv. [[bioRxiv](https://doi.org/10.1101/2022.07.27.501677)] - Boemo, MA. DNAscent v2: Detecting replication forks in nanopore sequencing data with deep learning. *BMC Genomics* 2021;22:430. [[Journal Link](https://doi.org/10.1186/s12864-021-07736-6)] - Muller CA, Boemo MA, Spingardi P, Kessler BM, Kriaucionis S, Simpson JT, Nieduszynski CA. Capturing the dynamics of genome replication on individual ultra-long nanopore sequence reads. *Nature Methods* 2019;16:429-436. 
[[Journal Link](https://www.nature.com/articles/s41592-019-0394-y)] diff --git a/dnn_models/BrdU_detect.pb b/dnn_models/BrdU_detect.pb deleted file mode 100644 index 1594a61..0000000 Binary files a/dnn_models/BrdU_detect.pb and /dev/null differ diff --git a/dnn_models/detect_model_BrdUEdU/saved_model.pb b/dnn_models/detect_model_BrdUEdU/saved_model.pb new file mode 100644 index 0000000..5976c64 Binary files /dev/null and b/dnn_models/detect_model_BrdUEdU/saved_model.pb differ diff --git a/dnn_models/detect_model_BrdUEdU/variables/variables.data-00000-of-00001 b/dnn_models/detect_model_BrdUEdU/variables/variables.data-00000-of-00001 new file mode 100644 index 0000000..a44a069 Binary files /dev/null and b/dnn_models/detect_model_BrdUEdU/variables/variables.data-00000-of-00001 differ diff --git a/dnn_models/detect_model_BrdUEdU/variables/variables.index b/dnn_models/detect_model_BrdUEdU/variables/variables.index new file mode 100644 index 0000000..08a6f91 Binary files /dev/null and b/dnn_models/detect_model_BrdUEdU/variables/variables.index differ diff --git a/dnn_models/forkSense.pb b/dnn_models/forkSense.pb deleted file mode 100644 index 1fc959a..0000000 Binary files a/dnn_models/forkSense.pb and /dev/null differ diff --git a/docs/source/base.rst b/docs/source/base.rst new file mode 100644 index 0000000..9f72094 --- /dev/null +++ b/docs/source/base.rst @@ -0,0 +1,46 @@ +.. DNAscent documentation master file, created by + sphinx-quickstart on Fri Feb 7 18:58:49 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +DNAscent +==================================== + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + installation + index + detect + forkSense + visualisation + workflows + cookbook + releaseNotes + +Overview +-------- + +DNAscent is software designed to detect the modified bases BrdU and EdU in Oxford Nanopore reads. 
In an experimental setup where BrdU and EdU are incorporated into nascent DNA by replication forks, this software can be used to answer questions that were traditionally answered by DNA fibre analysis. + +At present, the only Oxford Nanopore flow cells supported by DNAscent are R9.4.1. The Flongle, MinION, GridION, and PromethION platforms are all supported. + +DNAscent is under active development by the `Boemo Group `_ based in the `Department of Pathology, University of Cambridge `_. We aim to push regular updates and improvements, and incorporating new functionality is an active area of our computational research. + + +Publications +------------ + +If you use DNAscent for your research, please cite the following publications: + +Totanes FIG, Gockel J, Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. [`bioRxiv <https://doi.org/10.1101/2022.07.27.501677>`_] + +Boemo, MA. DNAscent v2: Detecting replication forks in nanopore sequencing data with deep learning. BMC Genomics 2021;22:430. [`Journal DOI <https://doi.org/10.1186/s12864-021-07736-6>`_] + +Muller CA, Boemo MA, Spingardi P, Kessler BM, Kriaucionis S, Simpson JT, Nieduszynski CA. Capturing the dynamics of genome replication on individual ultra-long nanopore sequence reads. Nature Methods 2019;16:429-436. [`Journal Link <https://www.nature.com/articles/s41592-019-0394-y>`_] + +Bugs, Questions, and Comments +----------------------------- + +Should any bugs arise or if you have any questions about usage, please raise a `GitHub issue <https://github.com/MBoemo/DNAscent/issues>`_. If you have comments or suggestions to improve the software or the documentation, please email Michael Boemo at mb915@cam.ac.uk. diff --git a/docs/source/conf.py b/docs/source/conf.py index fe98750..6949ed5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'DNAscent' -copyright = '2020, Michael A. Boemo' +copyright = '2022, Michael A. 
Boemo' # The full version, including alpha/beta/rc tags -release = '2.0.2' +release = '3.0.2' # -- General configuration --------------------------------------------------- @@ -41,7 +41,7 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] -master_doc = 'index' +master_doc = 'base' # -- Options for HTML output ------------------------------------------------- @@ -53,4 +53,4 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] +html_static_path = ['_static'] diff --git a/docs/source/cookbook.rst b/docs/source/cookbook.rst index f3dc53a..56e7702 100644 --- a/docs/source/cookbook.rst +++ b/docs/source/cookbook.rst @@ -29,73 +29,51 @@ The following barebones script parses the output of ``DNAscent detect``. We ite strand = splitLine[4] else: posOnRef = int(splitLine[0]) - probBrdU = float(splitLine[1]) - sixMerOnRef = splitLine[2] + probEdU = float(splitLine[1]) + probBrdU = float(splitLine[2]) + sixMerOnRef = splitLine[3] #add these values to a container or do some processing here f.close() -The following barebones script parses the output of ``DNAscent forkSense``. Note the similarity to the above script: All DNAscent output files were designed to have a very similar format to aid user processing. -.. 
code-block:: python - - f = open('path/to/output.forkSense','r') - - for line in f: - - #ignore the header lines - if line[0] == '#': - continue - - #split the line into a list by whitespace - splitLine = line.rstrip().split() - - if line[0] == '>': - - readID = splitLine[0][1:] - chromosome = splitLine[1] - refStart = int(splitLine[2]) - refEnd = int(splitLine[3]) - strand = splitLine[4] - else: - posOnRef = int(splitLine[0]) - probLeftFork = float(splitLine[1]) - probRightFork = float(splitLine[2]) - - #add these values to a container or do some processing here - - f.close() - -And again for ``DNAscent regions``: +The following example plots a histogram of fork track lengths from bed files generated by DNAscent forkSense. .. code-block:: python - f = open('path/to/output.regions','r') + import matplotlib + from matplotlib import pyplot as plt - for line in f: + fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed'] - #ignore the header lines - if line[0] == '#': - continue - - #split the line into a list by whitespace - splitLine = line.rstrip().split() + forkLengths = [] - if line[0] == '>': + for fn in fnames: + f = open(fn,'r') - readID = splitLine[0][1:] - chromosome = splitLine[1] - refStart = int(splitLine[2]) - refEnd = int(splitLine[3]) - strand = splitLine[4] - else: - regionStart = int(splitLine[0]) - regionEnd = int(splitLine[1]) - regionScore = float(splitLine[2]) + for line in f: - #add these values to a container or do some processing here + #ignore the header lines + if line[0] == '#': + continue + + #split the line into a list by whitespace + splitLine = line.rstrip().split() + + lbound = int(splitLine[1]) + rbound = int(splitLine[2]) + + forkLengths.append(rbound-lbound) - f.close() + f.close() + plt.figure() + plt.hist(forkLengths) + plt.xlabel('Fork Track Length (bp)') + plt.ylabel('Count') + plt.savefig('forkTrackLen.pdf') + plt.close() + + diff --git a/docs/source/detect.rst b/docs/source/detect.rst index 
e77af6c..84d01ae 100644 --- a/docs/source/detect.rst +++ b/docs/source/detect.rst @@ -1,9 +1,9 @@ .. _detect: -Detect +detect =============================== -``DNAscent detect`` is a ``DNAscent`` subprogram that goes through each read and, at each thymidine position, assigns the probability the thymidine is BrdU. +``DNAscent detect`` is a ``DNAscent`` subprogram that analyses each nanopore-sequenced read and, at each thymidine position, assigns the probability that the base is really BrdU and the probability that it is really EdU. Usage ----- @@ -21,15 +21,15 @@ Usage -t,--threads number of threads (default is 1 thread), --GPU use the GPU device indicated for prediction (default is CPU), -q,--quality minimum mapping quality (default is 20), - -l,--length minimum read length in bp (default is 100). + -l,--length minimum read length in bp (default is 1000). The main input of ``DNAscent detect`` is an alignment (bam file) between the sequence fastq from Guppy and the organism's reference genome. This bam file should be sorted using ``samtools sort`` and indexed using ``samtools index`` so that there is a .bam.bai file in the same directory as the bam file. (Please see the example in :ref:`workflows` for details on how to do this.) The full path to the reference genome used in the alignment should be passed using the ``-r`` flag, and the index required by the ``-i`` flag is the file created using ``DNAscent index`` (see :ref:`index`). -The number of threads is specified using the ``-t`` flag. ``DNAscent detect`` multithreads quite well by analysing a separate read on each thread, so multithreading is recommended. By default, the signal alignments and ResNet BrdU predictions are run on CPUs. If a CUDA-compatible GPU device is specified using the ``--GPU`` flag, then the signal alignments will be run on CPUs using the threads specified with ``-t`` and the ResNet BrdU prediction will be run on the GPU. Your GPU device number can be found with the command ``nvidia-smi``. 
GPU use requires that CUDA and cuDNN are set up correctly on your system and that these libraries can be accessed. If they're not, DNAscent will default back to using CPUs. +The number of threads is specified using the ``-t`` flag. ``DNAscent detect`` multithreads quite well by analysing a separate read on each thread, so multithreading is recommended. By default, the signal alignments and ResNet predictions are run on CPUs. If a CUDA-compatible GPU device is specified using the ``--GPU`` flag, then the signal alignments will be run on CPUs using the threads specified with ``-t`` and the ResNet BrdU and EdU predictions will be run on the GPU. Your GPU device number can be found with the command ``nvidia-smi``. GPU use requires that CUDA and cuDNN are set up correctly on your system and that these libraries can be accessed. If they're not, DNAscent will default back to using CPUs. It is sometimes useful to only run ``DNAscent detect`` on reads that exceed a certain mapping quality or length threshold (as measured by the subsequence of the contig that the read maps to). In order to do this without having to filter the bam file, DNAscent provides the ``-l`` and ``-q`` flags. Any read in the bam file with a reference length lower than the value specificed with ``-l`` or a mapping quality lower than the value specified with ``-q`` will be ignored. -Before calling BrdU in a read, ``DNAscent detect`` must first perform a fast event alignment (see https://www.biorxiv.org/content/10.1101/130633v2 for more details). Quality control checks are performed on these alignments, and if they're not passed, then the read fails and is ignored. Hence, the number of reads in the output file will be slightly lower than the number of input reads. Typical failure rates are about 5-10%, although this will vary slightly depending on the read length, the BrdU substitution rate, and the genome sequenced. 
+Before calling BrdU and EdU in a read, ``DNAscent detect`` must first perform a fast event alignment (see https://www.biorxiv.org/content/10.1101/130633v2 for more details). Quality control checks are performed on these alignments, and if they're not passed, then the read fails and is ignored. Hence, the number of reads in the output file will be slightly lower than the number of input reads. Typical failure rates are about 5-10%, although this will vary slightly depending on the read length, the analogue substitution rate, and the genome sequenced. Output ------ @@ -46,11 +46,12 @@ Output #Mode CNN #MappingQuality 20 #MappingLength 5000 - #SignalDilation 1.000000 - #Version 2.0.0 + #SystemStartTime 09/06/2022 12:45:29 + #Software /path/to/DNAscent + #Version 3.0.0 #Commit 4cf80a7b89bdf510a91b54572f8f94d3daf9b167 -You can easily access the header of any .detect file with ``head -11 /path/to/output.detect`` or, alternatively, ``grep '#' /path/to/output.detect``. +You can easily access the header of any .detect file with ``head -12 /path/to/output.detect`` or, alternatively, ``grep '#' /path/to/output.detect``. Below the header is data for each read. Note that everything in this output file orients to the reference genome in the 5' --> 3' direction. Each read starts with a line in the format: @@ -64,15 +65,16 @@ These lines always begin with a greater-than (>) character. Therefore, an easy * the read mapped between ``mappingStart`` and ``mappingEnd`` on ``contig``, * ``strand`` either takes the value ``fwd``, indicating that the read mapped to the forward strand, or ``rev`` indicating that the read mapped to the reverse complement strand. -The following shows an example for a read that to the reverse strand between 239248 and 286543 on chrII. +The following shows an example for a read that mapped to the reverse strand between 48490 and 53033 on chromosome 1. .. 
code-block:: console - >c602f23f-e892-42ba-8140-da949abafbdd chrII 239248 286543 rev + >0d64a203-81b5-4b6c-aa2f-67b20969a509 1 48490 53033 rev -Below these "start of read" lines, each line corresponds to the position of a thymidine in that read. There are three tab-separated columns: +Below these "start of read" lines, each line corresponds to the position of a thymidine in that read. There are four tab-separated columns: * the coordinate on the reference, +* probability that the thymidine is actually EdU, * probability that the thymidine is actually BrdU, * 6mer on the reference. @@ -81,32 +83,29 @@ Consider the following examples: .. code-block:: console - >c6785e1f-10d2-49cb-8ca3-e8d48979001b chrXIII 74003 81176 rev - 74010 0.012874 TCTCTA - 74011 0.012428 CTCTAA - 74014 0.016811 TAACGA - 74017 0.013372 CGACCA - 74018 0.013836 GACCAA + >a4ea2872-9cb6-4218-afad-905f79204eb1 14 992440 996846 rev + 992448 0.125751 0.131483 ATCTTA + 992450 0.082488 0.078428 CTTATA + 992451 0.070718 0.050604 TTATAA + 992453 0.062216 0.047409 ATAACA + 992456 0.056369 0.042582 ACATTA + 992457 0.046755 0.038603 CATTAA + 992459 0.056535 0.041545 TTAATA -Here, we're looking at the sequence TCTCTAACGACCAA on the reference genome. Because this read maps to the reverse complement, a call is made at every A (instead of T) on the reference. If instead we looked at a read that mapped to the forward strand, an example would be: + +Here, we're looking at the sequence ATCTTATAACATTAATA on the reference genome. Because this read maps to the reverse complement, a call is made at every A (instead of T) on the reference. The low probabilities of BrdU and EdU indicate the shown region of this particular molecule is unlikely to have analogue incorporated in it. -.. 
code-block:: console - - >5d10eb9a-aae1-4db8-8ec6-7ebb34d32575 chrXIII 72319 77137 fwd - 72319 0.017496 TCGTTT - 72322 0.029483 TTTCTG - 72323 0.039008 TTCTGT - 72324 0.031474 TCTGTG - 72326 0.026997 TGTGAG - -In both of these output snippets, we see from the second column that the probability of BrdU is low (around a 1-3% chance of BrdU) so these few bases are likely from a BrdU-negative region of DNA. In contrast, here we see the start of a read that does contain BrdU, and accordingly, the probability of BrdU at some positions is much higher: +If instead we looked at a read that mapped to the forward strand, an example would be: .. code-block:: console - >a4f36092-b4d5-47a9-813e-c22c3b477a0c chrXVI 899273 907581 fwd - 899276 0.866907 TCAAAT - 899281 0.947935 TCCACA - 899300 0.014683 TGGGAG - 899312 0.186812 TAACGG - 899320 0.934850 TTATTG + >d1e97c0f-5de7-4249-a426-30d5b4334106 2 748326 761207 fwd + 748327 0.066334 0.084699 TTTAGA + 748328 0.045942 0.152147 TTAGAA + 748329 0.040509 0.187831 TAGAAA + 748336 0.028645 0.278245 TCGGAC + 748352 0.021041 0.922350 TCGAAT + 748357 0.017188 0.932314 TGTAAT + 748359 0.016415 0.921368 TAATAT +Here, we have a few genomic positions with high probability BrdU calls, indicating that the shown region of this molecule may be BrdU-substituted. diff --git a/docs/source/forkSense.rst b/docs/source/forkSense.rst index a328403..b059329 100644 --- a/docs/source/forkSense.rst +++ b/docs/source/forkSense.rst @@ -3,7 +3,7 @@ forkSense =============================== -``DNAscent forkSense`` is a ``DNAscent`` subprogram that provides a probability estimate at each thymidine that a leftward- or rightward-moving fork moved through that position during the BrdU pulse. +``DNAscent forkSense`` is a ``DNAscent`` subprogram that interprets the pattern of BrdU and EdU incorporation on each molecule, segmenting the read to show where leftward- or rightward-moving forks were moving during the BrdU and EdU pulses. 
Usage ----- @@ -11,82 +11,82 @@ Usage .. code-block:: console To run DNAscent forkSense, do: - DNAscent forkSense -d /path/to/BrdUCalls.detect -o /path/to/output.forkSense + DNAscent forkSense -d /path/to/BrdUCalls.detect -o /path/to/output.forkSense --order EdU,BrdU Required arguments are: -d,--detect path to output file from DNAscent detect, - -o,--output path to output file for forkSense. + -o,--output path to output file for forkSense, + --order order in which the analogues were pulsed (EdU,BrdU or BrdU,EdU). Optional arguments are: -t,--threads number of threads (default: 1 thread), + --markAnalogues writes analogue incorporation locations to a bed file (default: off), --markOrigins writes replication origin locations to a bed file (default: off), --markTerminations writes replication termination locations to a bed file (default: off), --markForks writes replication fork locations to a bed file (default: off). -The only required input of ``DNAscent forkSense`` is the output file produced by ``DNAscent detect``. Note that the detect file must have been produced using the v2.0 ResNet algorithm; ``DNAscent forkSense`` is not compatible with legacy HMM-based detection. +The required inputs of ``DNAscent forkSense`` are the output file produced by ``DNAscent detect``, a new output file name for ``DNAscent forkSense`` to write on, and the order in which the analogues were pulsed. In the example command above, the ``--order`` flag indicates that EdU was pulsed first, and BrdU was pulsed second. The order of the pulses is important for determining fork direction and differentiating between origins and termination sites, but no information about the pulse length is needed. Note that the detect file must have been produced using the >v3.0.0 ResNet algorithm; ``DNAscent forkSense`` is not compatible with legacy HMM-based detection. 
Note further that >v3.0.0 ``DNAscent forkSense`` is not back compatible with the previous BrdU-only protocol, as it relies on the incorporation of both BrdU and EdU to determine fork direction. Users with data from a BrdU-only pulse-chase protocol should use DNAscent v2.0.2. -If the ``--markOrigins`` flag is passed, ``DNAscent forkSense`` will use detected leftward- and rightward-moving forks to infer the locations of fired replication origins and write these to a bed file called ``origins_DNAscent_forkSense.bed`` in the working directory. Likewise, if the ``--markTerminations`` flag is passed, termination sites will be recorded in a bed file called ``terminations_DNAscent_forkSense.bed``. Output ------ -If ``--markOrigins`` and/or ``--markTerminations`` were used, the resulting bed files has one called origin (for origins_DNAscent_forkSense.bed) or termination site (for terminations_DNAscent_forkSense.bed) per line and, in accordance with bed format, have the following space-separated columns: +Main Output File +^^^^^^^^^^^^^^^^ -* chromosome name, -* 5' boundary of the origin (or terminiation site), -* 3' boundary of the origin (or terminiation site), -* read header of the read that the call came from (similar to those in the output file of ``DNAscent detect``). +``DNAscent forkSense`` will produce a human-readable output file with the name and location that you specified using the ``-o`` flag. Like the output of ``DNAscent detect``, this file starts with a short header: -Note that the "resolution" of the calls (i.e., the third column minus the second column) will depend on your experimental setup. In synchronised early S-phase cells, this difference for origin calls is likely to be small as the leftward- and rightward-moving forks from a fired origin are nearby one another. In asynchronous or mid/late S-phase cells, the difference is likely to be larger as the forks from a single origin will have travelled some distance before the BrdU pulse. 
The bed files only specify the region between matching leftward- and rightward-moving forks. Any subsequent assumptions (such as assuming uniform fork speed and placing the origin in the middle of that region) are left to the user. +.. code-block:: console -The output of ``DNAscent forkSense`` is a file with similar formatting to that of ``DNAscent detect``. The format for the read headers is the same. From left to right, the tab-delimited columns indicate: + #DetectFile /path/to/DNAscent.detect + #Threads 1 + #Compute CPU + #SystemStartTime 10/06/2022 13:04:33 + #Software /path/to/DNAscent + #Version 3.0.0 + #Commit b9598a9e5bfa5f8314f92ba0f4fed39be1aee0be + #EstimatedRegionBrdU 0.559506 + #EstimatedRegionEdU 0.202767 -* the coordinate on the reference, -* probability that a leftward-moving fork passed through that coordinate during a BrdU pulse, -* probability that a rightward-moving fork passed through that coordinate during a BrdU pulse. +The fields in this header are analogous to the header from ``DNAscent detect``, but it includes two additional lines with an estimate of the thymidine-to-BrdU substitution rate in BrdU-positive regions and an estimate of the thymidine-to-EdU substitution rate in EdU-positive regions. In the example above, approximately 56% of thymidines are substituted for BrdU in BrdU-positive regions. -A low probability in both the second and third columns suggests it was unlikely that a fork passed through that position during the pulse. +The rest of this file has similar formatting to that of ``DNAscent detect``. The format for the read headers is the same. From left to right, the tab-delimited columns indicate: -The following example output shows the end of a read that was passed through by a leftward-moving fork: +* the coordinate on the reference, +* a Boolean (0 or 1) indicating whether that position is in an EdU-positive region, +* a Boolean (0 or 1) indicating whether that position is in a BrdU-positive region. 
+ +The following example output shows an example: .. code-block:: console - >22c8a674-ed0e-475f-9c54-cb185299d923 chrII 173332 210452 fwd - 173339 0.687217 0.062620 - 173341 0.687217 0.062620 - 173342 0.687217 0.062620 - 173345 0.687217 0.062620 - 173347 0.687217 0.062620 - 173348 0.687217 0.062620 - 173349 0.743986 0.045767 - 173358 0.743986 0.045767 - 173375 0.743986 0.045767 - 173377 0.743986 0.045767 - 173378 0.743986 0.045767 - 173381 0.743986 0.045767 - 173382 0.806924 0.038138 - 173383 0.806924 0.038138 - 173387 0.806924 0.038138 - 173390 0.806924 0.038138 - 173392 0.806924 0.038138 - 173393 0.806924 0.038138 - 173398 0.846875 0.032027 - 173402 0.846875 0.032027 - 173404 0.846875 0.032027 - 173406 0.846875 0.032027 - 173407 0.846875 0.032027 - 173417 0.846875 0.032027 - 173418 0.906748 0.028587 - 173419 0.906748 0.028587 - 173423 0.906748 0.028587 - 173425 0.906748 0.028587 - 173426 0.906748 0.028587 - 173428 0.906748 0.028587 - 173441 0.909755 0.029341 - 173445 0.909755 0.029341 - 173446 0.909755 0.029341 - 173449 0.909755 0.029341 - 173450 0.909755 0.029341 - 173451 0.909755 0.029341 - 173454 0.907803 0.029983 + >806d1f69-1054-4b74-8356-d935a282a22e 11 1089865 1130164 fwd + 1089873 0 0 + 1089874 0 0 + 1089877 0 0 + 1089878 0 0 + 1089879 0 0 + 1089880 0 0 + 1089882 0 0 + 1089895 0 0 + 1089899 0 0 + +Only reads that have at least one BrdU-positive or EdU-positive segment are written to this file. Reads with no base analogue segments called on them are omitted from this file, as 0's everywhere across these reads is implied. Note that the format of this file has changed substantially from DNAscent v2.*. This design decision stems from a shift in the algorithm used, as well as the desire to avoid using excess disk space with redundant information. 
+ +Bed Files +^^^^^^^^^ + +If the ``--markOrigins`` flag is passed, ``DNAscent forkSense`` will write the genomic region between matched leftward- and rightward-moving forks to a bed file called ``origins_DNAscent_forkSense.bed`` in the working directory. Likewise, if the ``--markTerminations`` flag is passed, the genomic region between leftward- and rightward-moving forks moving towards each other will be recorded in a bed file called ``terminations_DNAscent_forkSense.bed``. The flag ``--markAnalogues`` will create two separate bed files: one containing the genomic location of BrdU-positive segments, and another containing the genomic location of EdU-positive segments. + +If the ``--markForks`` flag is passed, two bed files will be created in the working directory. The genomic location of leftward- and rightward-moving forks will be written to separate bed files called ``leftForks_DNAscent_forkSense.bed`` and ``rightForks_DNAscent_forkSense.bed``. + +All output bed files have the following space-separated columns: + +* chromosome name, +* 5' boundary of the origin (or termination site, or fork), +* 3' boundary of the origin (or termination site, or fork), +* read header of the read that the call came from (similar to those in the output file of ``DNAscent detect``). +For origins and termination sites, the "resolution" of the calls (i.e., the third column minus the second column) will depend on your experimental setup. In synchronised early S-phase cells, the genomic distance between the 5' and 3' boundaries is likely to be small for origins and large for termination sites, as the leftward- and rightward-moving forks should be together near the origin. In asynchronous or mid/late S-phase cells, the origin calls may appear to be a "lower" resolution (i.e., larger differences between the 5' and 3' boundaries) as the forks from a single origin will have travelled some distance before the pulses. 
When both forks are together at an origin, the origin bed file will record the midpoint of the analogue segment for the analogue that was pulsed first. +The bed files created by ``DNAscent forkSense`` can be opened directly with a genome browser. diff --git a/docs/source/index.rst b/docs/source/index.rst index ee9f9d3..11d79fa 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,44 +1,27 @@ -.. DNAscent documentation master file, created by - sphinx-quickstart on Fri Feb 7 18:58:49 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +.. _index: -DNAscent -==================================== +index +=============================== -.. toctree:: - :maxdepth: 1 - :caption: Contents: +``DNAscent index`` is a ``DNAscent`` subprogram that creates a map between Oxford Nanopore readIDs and fast5 files. This allows ``DNAscent detect`` to scan through bam files and pull out the relevant signal information for each read. - installation - index_subprog - detect - regions - forkSense - psl - visualisation - workflows - cookbook - releaseNotes +Usage +----- -Overview --------- +.. code-block:: console -DNAscent is software designed to detect the modified base BrdU in Oxford Nanopore reads. In an experimental setup where BrdU is incorporated into nascent DNA by replication forks, this software can be used to answer questions that were traditionally answered by DNA fibre analysis. + To run DNAscent index, do: + DNAscent index -f /path/to/fast5Directory + Required arguments are: + -f,--files path to fast5 files, + -s,--sequencing-summary path to sequencing summary file Guppy. + Optional arguments are: + -o,--output output file name (default is index.dnascent), + --GridION account for the different sequencing summary format used by in-built GridION basecalling. -DNAscent is under active development by the `Boemo Group `_ based in the `Department of Pathology, University of Cambridge `_. 
We aim to push regular updates and improvements, and incorporating new functionality is an active area of our computational research. +The required inputs to ``DNAscent index`` are the full path to the top-level directory containing the sequencing run's fast5 files (passed using the ``-f`` flag) and the full path to the sequencing_summary.txt file from Guppy (located in the top-level Guppy output directory; passed using the ``-s`` flag). Note that the sequencing_summary.txt file created by onboard GridION basecalling has a slightly different format from that of the summary file generated by Guppy, and this can be accounted for with the ``--GridION`` flag. The default behaviour of ``DNAscent index`` is to place a file called ``index.dnascent`` in the working directory. The name of this file can be overridden using the ``-o`` flag. +Output +------- -Publications ------------- - -Please cite the following publication if you use DNAscent for your research: - -Boemo, MA. DNAscent v2: Detecting Replication Forks in Nanopore Sequencing Data with Deep Learning. BMC Genomics 2021;22:430. [`Journal DOI `_] - -Muller CA, Boemo MA, Spingardi P, Kessler, BM, Kriaucionis S, Simpson JT, Nieduszynski CA. Capturing the dynamics of genome replication on individual ultra-long nanopore sequence reads. Nature Methods 2019;16:429-436. [`Journal DOI `_] - -Bugs, Questions, and Comments ------------------------------ - -Should any bugs arise or if you have any questions about usage, please raise a `GitHub issue `_. If you have comments or suggestions to improve the software or the documentation, please Email Michael Boemo at mb915@cam.ac.uk. +``DNAscent index`` will put a file called ``index.dnascent`` in the current working directory (note that if you used the ``-o`` flag, then the file will have the name and location that you specified). This file will be needed as an input to ``DNAscent detect``. 
diff --git a/docs/source/index_subprog.rst b/docs/source/index_subprog.rst deleted file mode 100644 index 39213fe..0000000 --- a/docs/source/index_subprog.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. _index: - -Index -=============================== - -``DNAscent index`` is a ``DNAscent`` subprogram that creates a map between Oxford Nanopore readIDs and fast5 files. This allows ``DNAscent detect`` to scan through bam files and pull out the relevant signal information for each read. - -Usage ------ - -.. code-block:: console - - To run DNAscent index, do: - DNAscent index -f /path/to/fast5Directory - Required arguments are: - -f,--files path to fast5 files, - -s,--sequencing-summary path to sequencing summary file Guppy. - Optional arguments are: - -o,--output output file name (default is index.dnascent), - --GridION account for the different sequencing summary format used by in-built GridION basecalling. - -The first required input to ``DNAscent index`` is the full path to the top-level directory containing the sequencing run's fast5 files, passed using the ``-f`` flag. This will typically be the directory created with MinKNOW during sequencing. The second required input is the full path to the ``sequencing_summary.txt`` file, specified using the ``-s`` flag. This file is created by Guppy during basecalling and is located in the top level directory containing the Guppy-created fastq files. (Note that as of v2.0.3, providing the sequencing summary file is now required where it was optional in previous releases.) The default behaviour of ``DNAscent index`` is to place a file called ``index.dnascent`` in the working directory. The name of this file can be overridden using the ``-o`` flag. Note that the in-built version of Guppy on the GridION produces a sequencing summary file with a slightly different format than the version of Guppy available on the ONT Community website. Users can correct for this change of format from the GridION by adding the ``--GridION`` flag. 
- -Output -------- - -``DNAscent index`` will put a file called ``index.dnascent`` in the current working directory (note that if you used the ``-o`` flag, then the file will have the name and location that you specified). This file will be needed as an input to ``DNAscent detect``. diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 3ef6187..1694534 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -16,7 +16,7 @@ The DNAscent directory will appear in your current directory. Switch to the late .. code-block:: console cd DNAscent - git checkout 2.0.2 + git checkout 3.0.0 make This will put the DNAscent executable into the DNAscent/bin directory. Compilation requires a version of gcc that supports C++14, and a typical compile time for DNAscent and all of its dependencies is 5-7 minutes. @@ -32,35 +32,12 @@ Cloning the repository recursively (see above) will provide all the required dep Please note that the high throughput sequencing library (htslib) requires bzlib and lzma for compression. While these are common on most systems, if you don't have these, apt-get lzma-dev, liblzma-dev, and libbz2-dev. In addition, pfasta requires libbsd on Linux. -VBZ Fast5 Compression ---------------------- - -In new versions of MinKNOW, the fast5 files are compressed with VBZ Compression (see https://github.com/nanoporetech/vbz_compression). To use DNAscent on these compressed fast5 files, do the following (N.B., we're assuming you don't have root permissions): - -#. Go to https://github.com/nanoporetech/vbz_compression/releases and download the plugin appropriate for your processor architecture. In this example, we'll use ``ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz``. - -#. Download and unpack the plugin: - - .. code-block:: console - - wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz - tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz - -#. 
Add the plugin to your path: - - .. code-block:: console - - export HDF5_PLUGIN_PATH=/full/path/to/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin - -#. Run ``DNAscent detect`` as normal. - - GPU Use ------- -The ``DNAscent detect`` executable can make use of a GPU, although this is optional (see :ref:`detect`). DNAscent requires CUDA 10.0 and cuDNN 7.5, and information about these can be found at the following links: +The ``DNAscent detect`` executable can make use of a GPU, although this is optional (see :ref:`detect`). DNAscent requires CUDA 11.0 and cuDNN 8.0, and information about these can be found at the following links: * cuDNN: https://developer.nvidia.com/cudnn -* CUDA: https://developer.nvidia.com/cuda-10.0-download-archive +* CUDA: https://developer.nvidia.com/cuda-11.0-download-archive Always discuss any installation or version changes with your system administrator. diff --git a/docs/source/psl.rst b/docs/source/psl.rst deleted file mode 100644 index fe93405..0000000 --- a/docs/source/psl.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. _psl: - -psl -=============================== - -``DNAscent psl`` is a ``DNAscent`` subprogram that writes a psl file to visualise the output of ``DNAscent detect``. - -Usage ------ - -.. code-block:: console - - To run DNAscent psl, do: - DNAscent psl -d /path/to/DNAscentOutput.detect -r /path/to/reference.fasta -o /path/to/psl_prefix - Required arguments are: - -d,--detect path to output file from DNAscent detect, - -r,--reference path to genome reference in fasta format, - -o,--output path to output bed prefix. - Optional arguments are: - --threshold probability above which a BrdU call is considered positive (default: 0.8), - --min minimum read length to compute (default is 1), - --max maximum read length to compute (default is Inf). - -The output file from ``DNAscent detect`` should be passed using the ``-d`` flag, and the reference genome used in the alignment should be passed with the ``-r`` flag. 
- - -Output ------- - -The output is a psl file with each positive BrdU call marked as a tick. These files can then be opened in IGV or the UCSC Genome Browser to visualise positive BrdU calls genome-wide. Note that psl tracks are only plotted from the location of the first tick, so in order to visualise the portions of each read before the first BrdU call and after the last BrdU call, a placeholder tick is placed at the first and last coordinate of each read. diff --git a/docs/source/regions.rst b/docs/source/regions.rst deleted file mode 100644 index f1a09f8..0000000 --- a/docs/source/regions.rst +++ /dev/null @@ -1,57 +0,0 @@ -.. _regions: - -Regions -=============================== - -``DNAscent regions`` is a ``DNAscent`` subprogram that interprets the output of ``DNAscent detect`` to call regions of high and low BrdU incorporation. - -Note that as of v2.0, ``DNAscent regions`` has been largely superceded by ``DNAscent forkSense`` and the increased accuracy of ``DNAscent detect`` makes visualising BrdU incorporation in regions mostly unnecessary. However, it is still included to avoid breaking legacy workflows, and it does still have some uses as explained below. - -Usage ------ - -.. code-block:: console - - To run DNAscent regions, do: - DNAscent regions -d /path/to/DNAscentOutput.detect -o /path/to/DNAscentOutput.regions - Required arguments are: - -d,--detect path to output file from DNAscent detect, - -o,--output path to output directory for bedgraph files. - Optional arguments (if used with default ResNet-based detect) are: - -r,--resolution number of thymidines in a region (default is 10). 
- Optional arguments (if used with HMM-based detect) are: - --threshold probability above which a BrdU call is considered positive (default: 0.8), - -c,--cooldown minimum gap between positive analogue calls (default: 4), - -r,--resolution minimum length of regions (default is 100 bp), - -p,--probability override probability that a thymidine 6mer contains a BrdU (default: automatically calculated), - -z,--zScore override zScore threshold for BrdU call (default: automatically calculated). - -The only required input of ``DNAscent regions`` is the output file produced by ``DNAscent detect``. ``DNAscent regions`` will first look through the detect file and determine the approximate fraction of thymidines replaced by BrdU in BrdU-positive regions. Using this probability, a z-score is assigned to each window (100 bp wide by default, but this can be changed using the ``-r`` flag) to indicate whether there is more or less BrdU than would be expected for an average BrdU-positive region. Naturally, some regions will be BrdU-positive but will have a substitution rate lower than average for BrdU-positive regions. Hence, ``DNAscent regions`` determines an appropriate boundary threshold between BrdU-positive regions and thymidine regions and rescales all of the z-scores so that this boundary is 0. ``DNAscent regions`` will calculate these values for you, but they can be overridden with the ``-p`` and ``-z`` flags, though this is generally not recommended. The exceptions are runs with 0% BrdU or runs where a high BrdU incorporation is expected along the entirety of each read. This is because these parameters are computed assuming that there are two populations (BrdU-positive and thymidine-only segments of DNA). - -In order to determine regions of high and low BrdU incorporation, ``DNAscent regions`` needs to count positive BrdU calls. By default, a thymidine is considered to be BrdU if it was scored with a probability higher than 0.8 by ``DNAscent detect``. 
This value was tuned in-house to optimise signal-to-noise, but it can be changed with the ``--threshold`` flag. Likewise, some care has to be given to how positive calls are counted, as BrdU can sometimes shift the signal of neighbouring thymidines. To prevent artefacts from overcounting while minimising undercounting, the default behaviour is to only make a positive call at most every 4 bases, though this can be changed with the ``-c`` flag. - - -Output ------- - -The output of DNAscent regions is a file with similar formatting to that of ``DNAscent detect``. The format for the read headers is the same. From left to right, the tab-delimited columns indicate: - -* the start of the region, -* the end of the region, -* the z-score, -* the string "BrdU" if the score is positive and "Thym" if the score is negative. - -A large positive z-score indicates high BrdU incorporation in that region, and a large negative score indicates very little BrdU incorporation in that region. An example output is as follows: - -.. code-block:: console - - >bfdc06e0-001f-41f7-bbea-f2f6785a3860 chrI 0 28066 fwd - 62 167 -2.38086 Thym - 173 276 -2.38086 Thym - 283 388 -2.27466 Thym - 393 499 -2.00741 Thym - 501 605 -2.00741 Thym - 606 708 -2.48397 Thym - 713 817 -2.27466 Thym - -Note that the region width may sometimes vary slightly from the value specified. The region width is designated as the coordinate of the first thymidine greater than the window width (100 bp by default) from the starting coordinate. In order to guard against assigning a score to regions with very few thymidines, ``DNAscent regions`` will also extend the region until at least 10 calls are considered. 
diff --git a/docs/source/releaseNotes.rst b/docs/source/releaseNotes.rst index b986384..90e07b4 100644 --- a/docs/source/releaseNotes.rst +++ b/docs/source/releaseNotes.rst @@ -3,7 +3,18 @@ Release Notes =============================== -v2.0.2 +v3.0.2 +----------------- + +* ``DNAscent detect`` now detects two different thymidine analogues, BrdU and EdU, in the same molecule, +* ``DNAscent forkSense`` now uses the spatial patterning of EdU and BrdU to determine fork direction as in DNA fibre analysis, +* dnascent2bedgraph utility updated to plot both EdU and BrdU tracks in genome browsers, +* ``DNAscent regions`` is now deprecated and has been fully superseded by ``DNAscent forkSense``, +* ``DNAscent psl`` is now deprecated as reads can be more comprehensively plotted using the dnascent2bedgraph utility, +* Migration from Tensorflow 1.14 to 2.4.1 and, correspondingly, GPU usage now requires CUDA 11 and cuDNN 8, +* Released with `Totanes FIG, Gockel J, Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. bioRxiv `_. + +v2.0.0 ----------------- * Migration from HMM-based BrdU detection at every thymidine to ResNet-based detection at every thymidine, @@ -11,9 +22,9 @@ v2.0.2 * Support for BrdU detection on GPUs, * ``DNAscent forkSense`` to call replication origins and termination sites in both synchronously and asynchronously replicating cells at any point in S-phase, * ``DNAscent align`` to align nanopore signals to reference, -* Significant increases to replication origin calling accuracy and sensitivity, +* Significant increases to replication origin calling accuracy, * Visualisation utility for plotting output of multiple DNAscent executables as bedgraphs, -* Released with `Boemo, MA. DNAscent v2: Detecting Replication Forks in Nanopore Sequencing Data with Deep Learning. bioRxiv 2020 `_. +* Released with `Boemo, MA. DNAscent v2: Detecting replication forks in nanopore sequencing data with deep learning. 
BMC Genomics 2021;22:430 `_. v1.0.0 ----------------- @@ -22,7 +33,7 @@ v1.0.0 * Improvements to BrdU detection accuracy, * ``DNAscent train`` to train Guassian mixture models from nanopolish eventalign. -v0.1.0 +v0.1 ----------------- * HMM-based BrdU detection at ~160 thymidine-containing 6mers, diff --git a/docs/source/visualisation.rst b/docs/source/visualisation.rst index 60e8bc0..216dfea 100644 --- a/docs/source/visualisation.rst +++ b/docs/source/visualisation.rst @@ -3,28 +3,28 @@ Visualisation =============================== -DNAscent supports multilevel analysis: We want users to be able to see the fork calls made by ``DNAscent forkSense`` and visualise them alongside individual base-pair resolution BrdU calls by ``DNAscent detect`` in order to see why these calls are being made. To that end, we include a visualisation utility in ``DNAscent/utils`` that formats the output of DNAscent executables (detect, regions, and forkSense) into bedgraphs that can be visualised with IGV or the UCSC Genome Browser. You can supply this utility with the output from one, two, or all three of these executables. If more than one is specified, the utility organises the bedgraphs so that the tracks for each read are grouped together. +DNAscent supports multilevel analysis: We want users to be able to see the fork calls made by ``DNAscent forkSense`` and visualise them alongside the individual base-pair resolution BrdU and EdU calls by ``DNAscent detect`` in order to see why these calls are being made. To that end, we include a visualisation utility in ``DNAscent/utils`` that formats the output of DNAscent executables (detect and forkSense) into bedgraphs that can be visualised with IGV or the UCSC Genome Browser. You can supply this utility with the output from one or two of these executables. If more than one is specified, the utility organises the bedgraphs so that the tracks for each read are grouped together. Usage ----- .. 
code-block:: console - dnascent2bedgraph.py: Converts the output of DNAscent detect, regions, and forkSense into bedgraphs. + dnascent2bedgraph.py: Converts the output of DNAscent detect and forkSense into bedgraphs. To run dnascent2bedgraph.py, do: python dnascent2bedgraph.py [arguments] Example: python dnascent2bedgraph.py -d /path/to/dnascentDetect.out -f /path/to/dnascentForksense.out -o /path/to/newBedgraphDir -n 1000 --minLength 10000 Required arguments are at least one of the following: -d,--detect path to DNAscent detect output file, - -f,--forkSense path to DNAscent forkSense output file, - -r,--regions path to DNAscent regions output file. + -f,--forkSense path to DNAscent forkSense output file. Required argument is: -o,--output output directory which will be created. Optional arguments are: --minLength only convert reads with specified minimum read length (in base pairs) into bedgraphs (default: 1), --maxLength only convert reads with specified maximum read length (in base pairs) into bedgraphs (default: Inf), -n,--maxReads maximum number of reads to convert into bedgraphs (default: Inf), + --targets forkSense bed file with specific reads to plot, --filesPerDir maximum reads per subdirectory (default: 300). A further example of how to use ``dnascent2bedgraph`` is given in :ref:`workflows`. @@ -32,4 +32,4 @@ A further example of how to use ``dnascent2bedgraph`` is given in :ref:`workflow Output ------ -``dnascent2bedgraph`` will create the directory you specified using the ``-o`` flag which will contain integer-numbered subdirectories. Each of these subdirectories will contain the bedgraphs for the number of reads specified by ``--filesPerDir`` (default is 300). If the output of more than one DNAscent executable was specified using the ``-d``, ``-f``, and ``-r`` flags, then the bedgraphs for each read will be grouped together so that they appear in IGV as consecutive tracks. 
+``dnascent2bedgraph`` will create the directory you specified using the ``-o`` flag which will contain integer-numbered subdirectories. Each of these subdirectories will contain the bedgraphs for the number of reads specified by ``--filesPerDir`` (default is 300). If the output of more than one DNAscent executable was specified using the ``-d`` and ``-f`` flags, then the bedgraphs for each read will be grouped together so that they appear in IGV as consecutive tracks. Rather than plotting bedgraphs for every read in the sequencing run, it is sometimes useful to plot bedgraphs of just those reads that have a replication feature (e.g., an origin) called on them. When a bed file of origin calls from DNAscent forkSense is passed using the ``--targets`` flag, dnascent2bedgraph will only write bedgraphs of the reads in the bed file. diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst index 73375e1..38b9570 100644 --- a/docs/source/workflows.rst +++ b/docs/source/workflows.rst @@ -6,7 +6,7 @@ Workflow The following is a full DNAscent workflow, where we'll start off after Guppy has finished running (users that need help with Guppy should refer to the `Oxford Nanopore webpages `_). In particular, we assume the following: * you have a directory of 1D R9.5 or R9.4.1 450bp/s Oxford Nanopore fast5 reads (which may be in subdirectories) that you want to use for detection, -* these reads have been basecalled to fastq format using Albacore or Guppy (available from Oxford Nanopore), +* these reads have been basecalled to fastq format using Guppy (available from Oxford Nanopore), * you have a reference/genome file (in fasta format) for your reads. Example Workflow @@ -18,21 +18,22 @@ Download and compile DNAscent: git clone --recursive https://github.com/MBoemo/DNAscent.git cd DNAscent - git checkout 2.0.2 + git checkout 3.0.2 make - cd .. Concatenate the fastq files from Guppy: .. 
code-block:: console - cat /path/to/GuppyOutDirectory/*.fastq > reads.fastq + cat /path/to/GuppyOutDirectory/pass/*.fastq /path/to/GuppyOutDirectory/fail/*.fastq > reads.fastq + +Note that we recommend running DNAscent on reads that have passed and failed Guppy's QCs, hence concatenating them into a single fastq file above. Analogue-substituted reads (particularly if they are heavily substituted) are predisposed to failing Guppy's QCs, so only running DNAscent on Guppy's passed reads can disproportionately throw out the reads you are most interested in. DNAscent will do its own analogue-aware QCs at the ``DNAscent detect`` stage. Align the reads with `minimap2 `_ and sort with `samtools `_: .. code-block:: console - minimap2 -ax map-ont -o alignment.sam /path/to/reference.fasta reads.fastq + minimap2 -L -ax map-ont -o alignment.sam /path/to/reference.fasta reads.fastq samtools view -Sb -o alignment.bam alignment.sam samtools sort alignment.bam alignment.sorted samtools index alignment.sorted.bam @@ -55,6 +56,7 @@ Alternatively, if the system has a CUDA-compatible GPU in it, we can run ``nvidi .. code-block:: console + Thu Aug 20 21:06:57 2020 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ @@ -83,29 +85,38 @@ From this, we can see that the GPU's device ID is 0 (just to the left of Tesla) Note that we're assuming the CUDA libraries for the GPU have been set up properly (see :ref:`installation`). If these libraries can't be accessed, DNAscent will splash a warning saying so and default back to using CPUs. -When ``DNAscent detect`` is finished, it will should put a file called ``output.detect`` in the current directory. We can look at the individual positive BrdU calls with ``DNAscent psl``. 
Let's create a psl file that shows any position where BrdU is called at 0.7 probability or higher: +When ``DNAscent detect`` is finished, there will be a file called ``output.detect`` in the current directory. At this point, we can make bedgraphs out of the ``DNAscent detect`` output (see :ref:`visualisation`) which can be loaded into IGV or the UCSC Genome Browser. + +Lastly, we can run ``DNAscent forkSense`` on the output of ``DNAscent detect`` to measure replication fork movement. Suppose that in our experimental protocol, we pulsed BrdU first followed by EdU. Let's run it on four threads and specify that we want it to keep track of replication origins, forks, and termination sites: .. code-block:: console - DNAscent psl -d output.detect -r /full/path/to/reference.fasta -o output --threshold 0.7 + DNAscent forkSense -d output.detect -o output.forkSense -t 4 --markOrigins --markTerminations --markForks --order BrdU,EdU + +This will make the following files: -The resulting file ``output.psl`` can be loaded into IGV or the UCSC Genome Browser. At this point, we can make bedgraphs out of the ``DNAscent detect`` output (see :ref:`visualisation`) which can also be loaded into IGV or the UCSC Genome Browser. + +* origins_DNAscent_forkSense.bed (with our origin calls), +* terminations_DNAscent_forkSense.bed (with our termination calls), +* two bed files (leftForks_DNAscent_forkSense.bed, rightForks_DNAscent_forkSense.bed) with our fork calls, +* output.forkSense. -Lastly, we can run ``DNAscent forkSense`` on the output of ``DNAscent detect`` to measure replication fork movement. Let's run it on four threads and specify that we want it to keep track of replication origins, forks, and termination sites: +We can load the bed files directly into IGV to see where origins, forks, and terminations were called in the genome. + +We can visualise (see :ref:`visualisation`) output.forkSense by turning it into bedgraphs: .. 
code-block:: console - DNAscent forkSense -d output.detect -o output.forkSense -t 4 --markOrigins --markTerminations --markForks + python dnascent2bedgraph.py -d output.detect -f output.forkSense -o newBedgraphDirectory -This will make three files: origins_DNAscent_forkSense.bed (with our origin calls), terminations_DNAscent_forkSense.bed (with our termination calls), and output.forkSense. We can load the two bed files directly into IGV to see where origins and terminiations were called in the genome. +This will create a new directory called ``newBedgraphDirectory``. By passing both a ``forkSense`` and ``detect`` file to dnascent2bedgraph.py, the utility will convert them both into bedgraphs and organise them so that for each read, we can see the single-nt BrdU and EdU detection output from ``DNAscent detect`` right next to the left- and rightward-moving fork probabilities from ``DNAscent forkSense``. These bedgraphs can then be loaded into IGV or the UCSC Genome Browser. -We can visualise (see :ref:`visualisation`) the first 1500 reads of output.forkSense by turning them into bedgraphs: +Perhaps, however, we are only interested in viewing reads with origin calls on them. In this case, we can use the bed file generated above (origins_DNAscent_forkSense.bed) to specify that we only want bedgraphs of reads with origin calls on them. .. code-block:: console - python dnascent2bedgraph.py -d output.detect -f output.forkSense -o newBedgraphDirectory -n 1500 - -This will create a new directory called ``newBedgraphDirectory``. By passing both a ``forkSense`` and ``detect`` file to dnascent2bedgraph.py, the utility will convert them both into bedgraphs and organise them so that for each read, we can see the bp-resolution BrdU detection output from ``DNAscent detect`` right next to the left- and rightward-moving fork probabilities from ``DNAscent forkSense``. These bedgraphs can then be loaded into IGV or the UCSC Genome Browser. 
+ python dnascent2bedgraph.py -d output.detect -f output.forkSense -o newBedgraphDirectory --targets origins_DNAscent_forkSense.bed + +This strategy works equally well for any of the bed files generated by DNAscent forkSense. Barcoding --------- diff --git a/src/DNAscent.cpp b/src/DNAscent.cpp index 370bd51..a31bb1f 100755 --- a/src/DNAscent.cpp +++ b/src/DNAscent.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If @@ -11,8 +11,6 @@ #include #include #include "detect.h" -#include "regions.h" -#include "psl.h" #include "forkSense.h" #include "index.h" #include "common.h" @@ -29,8 +27,6 @@ static std::map< std::string, std::function< int( int, char** ) > > executables {"index", index_main}, {"detect", detect_main}, {"forkSense", sense_main}, - {"psl", psl_main}, - {"regions", regions_main}, {"align", align_main}, {"trainCNN", data_main}, {"trainGMM", train_main}, @@ -65,20 +61,6 @@ std::vector< std::pair< double, double > > thymidineModel; /*main DNAscent executable that will link to other executables */ int main( int argc, char** argv ){ - /* suppresses tensorflow warnings - int env = setenv("TF_CPP_MIN_LOG_LEVEL", "2", 1); - if (env == -1){ - std::cerr << "Suppression of Tensorflow logs and warnings failed." << std::endl; - } - */ - - /* suppresses tensorflow warnings - int env = setenv("CUDA_VISIBLE_DEVICES", "", 1); - if (env == -1){ - std::cerr << "Suppression of Tensorflow logs and warnings failed." 
<< std::endl; - } - */ - //load pore models thymidineModel = import_poreModel("template_median68pA.6mer.model"); analogueModel = import_poreModel("BrdU.model"); diff --git a/src/alignment.cpp b/src/alignment.cpp index f11db2e..91a998a 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -38,7 +38,8 @@ static const char *help= " -t,--threads number of threads (default is 1 thread),\n" " -m,--maxReads maximum number of reads to consider,\n" " -q,--quality minimum mapping quality (default is 20),\n" -" -l,--length minimum read length in bp (default is 100).\n" +" -l,--length minimum read length in bp (default is 100),\n" +" --useRaw write raw signal instead of events.\n" "Written by Michael Boemo, Department of Pathology, University of Cambridge.\n" "Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues)."; @@ -47,7 +48,7 @@ struct Arguments { std::string referenceFilename; std::string outputFilename; std::string indexFilename; - bool methylAware, capReads; + bool methylAware, capReads, useRaw; double divergence; int minQ, maxReads; int minL; @@ -83,6 +84,7 @@ Arguments parseAlignArguments( int argc, char** argv ){ args.methylAware = false; args.divergence = 0; args.capReads = false; + args.useRaw = false; args.maxReads = 0; args.dilation = 1.0; @@ -158,6 +160,11 @@ Arguments parseAlignArguments( int argc, char** argv ){ args.methylAware = true; i+=1; } + else if ( flag == "--useRaw" ){ + + args.useRaw = true; + i+=1; + } else throw InvalidOption( flag ); } if (args.outputFilename == args.indexFilename or args.outputFilename == args.referenceFilename or args.outputFilename == args.bamFilename) throw OverwriteFailure(); @@ -545,7 +552,8 @@ bool referenceDefined(std::string &readSnippet){ std::string eventalign( read &r, unsigned int totalWindowLength, - double signalDilation){ + double signalDilation, + bool useRaw){ std::string out; //get the positions on the reference subsequence where we could attempt to make a call @@ -614,6 
+622,7 @@ std::string eventalign( read &r, } std::vector< double > eventSnippet; + std::vector< unsigned int > eventIndices; std::vector< double > eventLengthsSnippet; /*get the events that correspond to the read snippet */ @@ -632,6 +641,7 @@ std::string eventalign( read &r, double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; if (ev > r.scalings.shift + 1.0 and ev < 250.0){ eventSnippet.push_back( ev ); + eventIndices.push_back( (r.eventAlignment)[j].first ); eventLengthsSnippet.push_back( (r.eventLengths)[(r.eventAlignment)[j].first] ); } } @@ -694,7 +704,7 @@ std::string eventalign( read &r, unsigned int evPos; std::string sixMerRef; if (r.isReverse){ - evPos = globalPosOnRef - pos; + evPos = globalPosOnRef - pos + 5; sixMerRef = reverseComplement(sixMerStrand); } else{ @@ -702,12 +712,30 @@ std::string eventalign( read &r, sixMerRef = sixMerStrand; } - if (label == "M"){ + if (useRaw){ + + unsigned int globalEvIdx = eventIndices[evIdx]; std::pair meanStd = thymidineModel[sixMer2index(sixMerStrand)]; - out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + for (unsigned int raw_i = r.eventIdx2rawIdx[globalEvIdx].first; raw_i <= r.eventIdx2rawIdx[globalEvIdx].second; raw_i++){ + + + if (label == "M"){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + "NNNNNN" 
+ "\t" + "0" + "\t" + "0" + "\n"; + } + } } - else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment - out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + else{ + + if (label == "M"){ + std::pair meanStd = thymidineModel[sixMer2index(sixMerStrand)]; + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + } } evIdx ++; @@ -787,6 +815,7 @@ std::string eventalign( read &r, } std::vector< double > eventSnippet; + std::vector< unsigned int > eventIndices; std::vector< double > eventLengthsSnippet; /*get the events that correspond to the read snippet */ @@ -807,6 +836,7 @@ std::string eventalign( read &r, double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; if (ev > r.scalings.shift + 1.0 and ev < 250.0){ eventSnippet.push_back( ev ); + eventIndices.push_back( (r.eventAlignment)[j].first ); eventLengthsSnippet.push_back( (r.eventLengths)[(r.eventAlignment)[j].first] ); } } @@ -869,7 +899,7 @@ std::string eventalign( read &r, unsigned int evPos; std::string sixMerRef; if (r.isReverse){ - evPos = globalPosOnRef + pos; + evPos = globalPosOnRef + pos + 5; sixMerRef = reverseComplement(sixMerStrand); } else{ @@ -877,12 +907,30 @@ std::string eventalign( read &r, sixMerRef = sixMerStrand; } - if (label == "M"){ + if (useRaw){ + + unsigned int globalEvIdx = 
eventIndices[evIdx]; std::pair meanStd = thymidineModel[sixMer2index(sixMerStrand)]; - lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"); + for (unsigned int raw_i = r.eventIdx2rawIdx[globalEvIdx].first; raw_i <= r.eventIdx2rawIdx[globalEvIdx].second; raw_i++){ + + + if (label == "M"){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + } + } } - else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment - lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"); + else{ + + if (label == "M"){ + std::pair meanStd = thymidineModel[sixMer2index(sixMerStrand)]; + lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"); + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" 
+ std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"); + } } evIdx ++; @@ -908,9 +956,10 @@ std::string eventalign( read &r, std::string eventalign_train( read &r, - unsigned int totalWindowLength, - std::map &BrdULikelihood, - double signalDilation){ + unsigned int totalWindowLength, + std::map> &BrdULikelihood, + double signalDilation, + bool useRaw){ std::string out; //get the positions on the reference subsequence where we could attempt to make a call @@ -979,6 +1028,7 @@ std::string eventalign_train( read &r, } std::vector< double > eventSnippet; + std::vector< unsigned int > eventIndices; std::vector< double > eventLengthsSnippet; /*get the events that correspond to the read snippet */ @@ -997,6 +1047,7 @@ std::string eventalign_train( read &r, double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; if (ev > r.scalings.shift + 1.0 and ev < 250.0){ eventSnippet.push_back( ev ); + eventIndices.push_back( (r.eventAlignment)[j].first ); eventLengthsSnippet.push_back( (r.eventLengths)[(r.eventAlignment)[j].first] ); } } @@ -1044,7 +1095,7 @@ std::string eventalign_train( read &r, evIdx = 0; for (size_t i = 0; i < stateLabels.size(); i++){ - std::string label = stateLabels[i].substr(stateLabels[i].find('_')+1); + std::string label = stateLabels[i].substr(stateLabels[i].find('_')+1); int pos = std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); if (label == "D") continue; //silent states don't emit an event @@ -1059,7 +1110,7 @@ std::string eventalign_train( read &r, unsigned int evPos; std::string sixMerRef; if (r.isReverse){ - evPos = globalPosOnRef - pos; + evPos = globalPosOnRef - pos + 5; sixMerRef = reverseComplement(sixMerStrand); } else{ @@ -1068,14 +1119,34 @@ std::string eventalign_train( read &r, } std::pair meanStd = thymidineModel[sixMer2index(sixMerStrand)]; - if (label == "M" and BrdULikelihood.count(evPos) > 0){ - out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + 
std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\t" + std::to_string(BrdULikelihood[evPos]) + "\n"; - } - else if (label == "M"){ - out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + + if (useRaw){ + + unsigned int globalEvIdx = eventIndices[evIdx]; + for (unsigned int raw_i = r.eventIdx2rawIdx[globalEvIdx].first; raw_i <= r.eventIdx2rawIdx[globalEvIdx].second; raw_i++){ + + if (label == "M" and BrdULikelihood.count(evPos) > 0){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\t" + std::to_string(BrdULikelihood[evPos].first) + "\t" + std::to_string(BrdULikelihood[evPos].second) + "\n"; + } + else if (label == "M"){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + } + } } - else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment - out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + 
std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + else{ + + if (label == "M" and BrdULikelihood.count(evPos) > 0){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\t" + std::to_string(BrdULikelihood[evPos].first)+ "\t" + std::to_string(BrdULikelihood[evPos].second) + "\n"; + } + else if (label == "M"){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + } } evIdx ++; @@ -1145,6 +1216,7 @@ std::string eventalign_train( read &r, } std::vector< double > eventSnippet; + std::vector< unsigned int > eventIndices; std::vector< double > eventLengthsSnippet; /*get the events that correspond to the read snippet */ @@ -1165,6 +1237,7 @@ std::string eventalign_train( read &r, double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; if (ev > r.scalings.shift + 1.0 and ev < 250.0){ eventSnippet.push_back( ev ); + eventIndices.push_back( (r.eventAlignment)[j].first ); eventLengthsSnippet.push_back( (r.eventLengths)[(r.eventAlignment)[j].first] ); } } @@ -1213,9 +1286,9 @@ std::string eventalign_train( read &r, for (size_t i = 0; i < stateLabels.size(); i++){ std::string label = stateLabels[i].substr(stateLabels[i].find('_')+1); - int pos = std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); + int pos = 
std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); - if (label == "D") continue; //silent states don't emit an event + if (label == "D") continue; //silent states don't emit an event std::string sixMerStrand = (r.referenceSeqMappedTo).substr(posOnRef - pos - 6, 6); @@ -1227,7 +1300,7 @@ std::string eventalign_train( read &r, unsigned int evPos; std::string sixMerRef; if (r.isReverse){ - evPos = globalPosOnRef + pos; + evPos = globalPosOnRef + pos + 5; sixMerRef = reverseComplement(sixMerStrand); } else{ @@ -1236,17 +1309,37 @@ std::string eventalign_train( read &r, } std::pair meanStd = thymidineModel[sixMer2index(sixMerStrand)]; - if (label == "M" and BrdULikelihood.count(evPos) > 0){ - lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\t" + std::to_string(BrdULikelihood[evPos]) + "\n"); - } - else if (label == "M"){ - lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"); + + if (useRaw){ + + unsigned int globalEvIdx = eventIndices[evIdx]; + for (unsigned int raw_i = r.eventIdx2rawIdx[globalEvIdx].first; raw_i <= r.eventIdx2rawIdx[globalEvIdx].second; raw_i++){ + + if (label == "M" and BrdULikelihood.count(evPos) > 0){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\t" + std::to_string(BrdULikelihood[evPos].first) + "\t" + std::to_string(BrdULikelihood[evPos].second) + "\n"; + } + else if (label == "M"){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + 
std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string((r.raw[raw_i]- r.scalings.shift) / r.scalings.scale) + "\t" + std::to_string(0) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + } + } } - else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment - lines.push_back(std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"); + else{ + + if (label == "M" and BrdULikelihood.count(evPos) > 0){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\t" + std::to_string(BrdULikelihood[evPos].first) + "\t" + std::to_string(BrdULikelihood[evPos].second) + "\n"; + } + else if (label == "M"){ + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + sixMerStrand + "\t" + std::to_string(meanStd.first) + "\t" + std::to_string(meanStd.second) + "\n"; + } + else if (label == "I" and evIdx < lastM_ev){ //don't print insertions after the last match because we're going to align these in the next segment + out += std::to_string(evPos) + "\t" + sixMerRef + "\t" + std::to_string(scaledEvent) + "\t" + std::to_string(eventLength) + "\t" + "NNNNNN" + "\t" + "0" + "\t" + "0" + "\n"; + } } - evIdx ++; + evIdx ++; } //go again starting at posOnRef + lastM_ref using events 
starting at readHead + lastM_ev @@ -1267,6 +1360,8 @@ std::pair> eventalign_detect( read &r, unsigned int totalWindowLength, double signalDilation ){ + bool useRaw = true; + //get the positions on the reference subsequence where we could attempt to make a call std::string strand; int readHead = 0; @@ -1341,6 +1436,7 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng } std::vector< double > eventSnippet; + std::vector< unsigned int > eventIndices; std::vector< double > eventLengthsSnippet; /*get the events that correspond to the read snippet */ @@ -1359,6 +1455,7 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; if (ev > r.scalings.shift + 1.0 and ev < 250.0){ eventSnippet.push_back( ev ); + eventIndices.push_back( (r.eventAlignment)[j].first ); eventLengthsSnippet.push_back( (r.eventLengths)[(r.eventAlignment)[j].first] ); } } @@ -1399,14 +1496,14 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng for (size_t i = 0; i < stateLabels.size(); i++){ std::string label = stateLabels[i].substr(stateLabels[i].find('_')+1); - int pos = std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); + int pos = std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); - if (label == "M"){ - lastM_ev = evIdx; - lastM_ref = pos; - } + if (label == "M"){ + lastM_ev = evIdx; + lastM_ref = pos; + } - if (label != "D") evIdx++; //silent states don't emit an event + if (label != "D") evIdx++; //silent states don't emit an event } //do a second pass to print the alignment @@ -1414,9 +1511,9 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng for (size_t i = 0; i < stateLabels.size(); i++){ std::string label = stateLabels[i].substr(stateLabels[i].find('_')+1); - int pos = std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); + int pos = 
std::stoi(stateLabels[i].substr(0,stateLabels[i].find('_'))); - if (label == "D") continue; //silent states don't emit an event + if (label == "D") continue; //silent states don't emit an event std::string sixMerStrand = (r.referenceSeqMappedTo).substr(posOnRef + pos, 6); @@ -1428,7 +1525,7 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng unsigned int evPos; std::string sixMerRef; if (r.isReverse){ - evPos = globalPosOnRef - pos; + evPos = globalPosOnRef - pos + 5; sixMerRef = reverseComplement(sixMerStrand); } else{ @@ -1437,10 +1534,20 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng } if (label == "M"){ - ar -> addEvent(sixMerStrand, evPos, scaledEvent, eventLength); - } - evIdx ++; + if (useRaw){ + + unsigned int globalEvIdx = eventIndices[evIdx]; + for (unsigned int raw_i = r.eventIdx2rawIdx[globalEvIdx].first; raw_i <= r.eventIdx2rawIdx[globalEvIdx].second; raw_i++){ + + ar -> addEvent(sixMerStrand, evPos, (r.raw[raw_i]- r.scalings.shift) / r.scalings.scale, 0.); + } + } + else{ + ar -> addEvent(sixMerStrand, evPos, scaledEvent, eventLength); + } + } + evIdx ++; } @@ -1514,6 +1621,7 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng } std::vector< double > eventSnippet; + std::vector< unsigned int > eventIndices; std::vector< double > eventLengthsSnippet; /*get the events that correspond to the read snippet */ @@ -1534,6 +1642,7 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; if (ev > r.scalings.shift + 1.0 and ev < 250.0){ eventSnippet.push_back( ev ); + eventIndices.push_back( (r.eventAlignment)[j].first ); eventLengthsSnippet.push_back( (r.eventLengths)[(r.eventAlignment)[j].first] ); } } @@ -1602,7 +1711,7 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng unsigned int evPos; std::string sixMerRef; if (r.isReverse){ - evPos 
= globalPosOnRef + pos; + evPos = globalPosOnRef + pos + 5; sixMerRef = reverseComplement(sixMerStrand); } else{ @@ -1611,7 +1720,17 @@ std::cerr << "Out of reference sequence size: " << (r.referenceSeqMappedTo).leng } if (label == "M"){ - ar -> addEvent(sixMerStrand, evPos, scaledEvent, eventLength); + if (useRaw){ + + unsigned int globalEvIdx = eventIndices[evIdx]; + for (unsigned int raw_i = r.eventIdx2rawIdx[globalEvIdx].first; raw_i <= r.eventIdx2rawIdx[globalEvIdx].second; raw_i++){ + + ar -> addEvent(sixMerStrand, evPos, (r.raw[raw_i]- r.scalings.shift) / r.scalings.scale, 0.); + } + } + else{ + ar -> addEvent(sixMerStrand, evPos, scaledEvent, eventLength); + } } evIdx ++; @@ -1751,7 +1870,7 @@ int align_main( int argc, char** argv ){ continue; } - std::string out = eventalign( r, windowLength, args.dilation); + std::string out = eventalign( r, windowLength, args.dilation, args.useRaw); #pragma omp critical { diff --git a/src/alignment.h b/src/alignment.h index cf11e7e..8d5f0a2 100644 --- a/src/alignment.h +++ b/src/alignment.h @@ -55,28 +55,26 @@ class AlignedPosition{ return refPos; } - std::vector makeFeature(void){ + std::vector makeFeature(void){ assert(events.size() > 0 && events.size() == lengths.size()); assert(sixMer.substr(0,1) == "A" || sixMer.substr(0,1) == "T" || sixMer.substr(0,1) == "G" || sixMer.substr(0,1) == "C"); + //one-hot encode bases - std::vector feature = {0., 0., 0., 0.}; + std::vector feature = {0., 0., 0., 0.}; if (sixMer.substr(0,1) == "A") feature[0] = 1.; else if (sixMer.substr(0,1) == "T") feature[1] = 1.; else if (sixMer.substr(0,1) == "G") feature[2] = 1.; else if (sixMer.substr(0,1) == "C") feature[3] = 1.; - - //events + + std::pair meanStd = thymidineModel[sixMer2index(sixMer)]; + + //event means double eventMean = vectorMean(events); - feature.push_back(eventMean); - - //event lengths double lengthsSum = vectorSum(lengths); + feature.push_back(eventMean); feature.push_back(lengthsSum); - - //pore model - std::pair 
meanStd = thymidineModel[sixMer2index(sixMer)]; feature.push_back(meanStd.first); feature.push_back(meanStd.second); @@ -139,17 +137,17 @@ class AlignedRead{ unsigned int getMappingUpper(void){ return mappingUpper; } - std::vector makeTensor(void){ + std::vector makeTensor(void){ assert(strand == "fwd" || strand == "rev"); - std::vector tensor; + std::vector tensor; tensor.reserve(NFEATURES * positions.size()); if (strand == "fwd"){ for (auto p = positions.begin(); p != positions.end(); p++){ - std::vector feature = (p -> second) -> makeFeature(); + std::vector feature = (p -> second) -> makeFeature(); tensor.insert(tensor.end(), feature.begin(), feature.end()); } } @@ -157,7 +155,7 @@ class AlignedRead{ for (auto p = positions.rbegin(); p != positions.rend(); p++){ - std::vector feature = (p -> second) -> makeFeature(); + std::vector feature = (p -> second) -> makeFeature(); tensor.insert(tensor.end(), feature.begin(), feature.end()); } } @@ -181,6 +179,62 @@ class AlignedRead{ } return out; } + std::string getCigar(void){ + + std::vector> cigarTuples; + if (strand == "fwd"){ + + int matchRun = 0; + + for (auto p = std::next(positions.begin()); p != positions.end(); p++){ + + //deletion + if ((p -> first) - (std::prev(p) -> first) != 1){ + + if (matchRun > 0) cigarTuples.push_back(std::make_pair(1,matchRun)); + matchRun = 0; + cigarTuples.push_back(std::make_pair(0,(p -> first) - (std::prev(p) -> first)-1)); + } + else{ + matchRun++; + } + } + if (matchRun > 0) cigarTuples.push_back(std::make_pair(1,matchRun)); + + } + else{ + + int matchRun = 0; + + for (auto p = std::next(positions.rbegin()); p != positions.rend(); p++){ + + //deletion + if ((p -> first) - (std::prev(p) -> first) != 1){ + + if (matchRun > 0) cigarTuples.push_back(std::make_pair(1,matchRun)); + matchRun = 0; + cigarTuples.push_back(std::make_pair(0,(p -> first) - (std::prev(p) -> first)-1)); + } + else{ + matchRun++; + } + } + if (matchRun > 0) cigarTuples.push_back(std::make_pair(1,matchRun)); + 
} + + //convert to string + std::string cigarString; + for (auto c = cigarTuples.begin(); c < cigarTuples.end(); c++){ + + //deletion + if (c -> first == 0) cigarString += std::to_string(c -> second) + "D"; + + //match + if (c -> first == 1) cigarString += std::to_string(c -> second) + "M"; + } + + return cigarString; + } std::vector getSixMers(void){ std::vector out; @@ -199,16 +253,16 @@ class AlignedRead{ } return out; } - std::pair getShape(void){ + std::vector getShape(void){ - return std::make_pair(positions.size(), NFEATURES); + return {positions.size(), NFEATURES}; } }; /*function prototypes */ int align_main( int argc, char** argv ); -std::string eventalign_train( read &, unsigned int , std::map &, double); +std::string eventalign_train( read &, unsigned int , std::map> &, double, bool); std::pair> eventalign_detect( read &, unsigned int, double ); #endif diff --git a/src/common.cpp b/src/common.cpp index 4e68a42..af2d747 100755 --- a/src/common.cpp +++ b/src/common.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -73,7 +73,7 @@ double vectorStdv( std::vector< double > &obs, double &mean ){ for ( size_t i = 0; i < obs.size(); i++ ){ total += pow(obs[i] - mean, 2.0); } - return total / (double) obs.size(); + return sqrt(total / (double) obs.size()); } @@ -102,7 +102,58 @@ int argMin( std::vector< double > vec ){ } -int argMax( std::vector< double > vec ){ +double logistic(double input, double slope, double centre){ + + return 1/(1 + exp (-slope*(abs(input)-centre))); +} + + +std::vector movingAvgFilter( std::vector &input, unsigned int filterSize){ + + std::vector out; + + for (size_t i = filterSize/2; i < input.size() - filterSize/2; i++){ + + double sum = 0.; + for (size_t j = i - filterSize/2; j < i + filterSize/2; j++){ + sum += input[j]; + } + out.push_back( sum / (double) filterSize ); + } + return out; +} + + +std::vector movingAvgFilterLogistic( std::vector &input, unsigned int filterSize){ + + std::vector out; + + for (size_t i = filterSize/2; i < input.size() - filterSize/2; i++){ + + double sum = 0.; + for (size_t j = i - filterSize/2; j < i + filterSize/2; j++){ + sum += input[j]; + } + out.push_back( logistic( sum / (double) filterSize, 20, 0.2)); + } + return out; +} + + +std::vector normVectorSum(std::vector input){ + + double sum = vectorSum(input); + + std::vector output; + for (size_t i = 0; i < input.size(); i++){ + + output.push_back(input[i]/sum); + } + return output; +} + + +int argMax(std::vector< double > vec){ double highest = vec[0]; int index = 0; diff --git a/src/common.h b/src/common.h index 49b9658..a61e739 100755 --- a/src/common.h +++ b/src/common.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -10,7 +10,7 @@ #ifndef COMMON_H #define COMMON_H -#define VERSION "2.0.3" +#define VERSION "3.0.2" #include #include @@ -187,5 +187,8 @@ int argMax( std::vector< double > ); double vectorMean( std::vector< double > & ); double vectorStdv( std::vector< double > &, double & ); double vectorSum( std::vector< double > & ); +std::vector movingAvgFilter(std::vector &, unsigned int); +std::vector movingAvgFilterLogistic(std::vector &, unsigned int); +std::vector normVectorSum(std::vector); #endif diff --git a/src/data_IO.cpp b/src/data_IO.cpp index 1cdf7a0..8c3ac0b 100755 --- a/src/data_IO.cpp +++ b/src/data_IO.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If @@ -15,16 +15,18 @@ #include #include #include +#include #include #include #include #include #include "data_IO.h" -#include "error_handling.h" #include "pfasta/pfasta.h" #include "poreModels.h" #include "gitcommit.h" #include "common.h" +#include "softwarepath.h" +#include "forkSense.h" std::string writeDetectHeader(std::string alignmentFilename, std::string refFilename, @@ -42,6 +44,12 @@ std::string writeDetectHeader(std::string alignmentFilename, if (useGPU) compMode = "GPU"; else compMode = "CPU"; + auto t = std::time(nullptr); + auto tm = *std::localtime(&t); + std::ostringstream oss; + oss << std::put_time(&tm, "%d/%m/%Y %H:%M:%S"); + auto str = oss.str(); + std::string out; out += "#Alignment " + alignmentFilename + "\n"; out += "#Genome " + refFilename + "\n"; @@ -51,7 +59,8 @@ std::string writeDetectHeader(std::string alignmentFilename, out += "#Mode " + detMode + "\n"; out += "#MappingQuality " + std::to_string(quality) + "\n"; out += "#MappingLength " + std::to_string(length) + "\n"; - out += "#SignalDilation " + 
std::to_string(dilation) + "\n"; + out += "#SystemStartTime " + str + "\n"; + out += "#Software " + std::string(executablePath) + "\n"; out += "#Version " + std::string(VERSION) + "\n"; out += "#Commit " + std::string(gitcommit) + "\n"; @@ -70,6 +79,12 @@ std::string writeRegionsHeader(std::string detectFile, if (useHMM) detMode = "HMM"; else detMode = "CNN"; + auto t = std::time(nullptr); + auto tm = *std::localtime(&t); + std::ostringstream oss; + oss << std::put_time(&tm, "%d/%m/%Y %H:%M:%S"); + auto str = oss.str(); + std::string out; out += "#DetectFile " + detectFile + "\n"; out += "#Mode " + detMode + "\n"; @@ -78,18 +93,8 @@ std::string writeRegionsHeader(std::string detectFile, out += "#Resolution " + std::to_string(resolution) + "\n"; out += "#Probability " + std::to_string(probability) + "\n"; out += "#ZScore " + std::to_string(zscore) + "\n"; - out += "#Version " + std::string(VERSION) + "\n"; - out += "#Commit " + std::string(gitcommit) + "\n"; - - return out; -} - -std::string writeForkSenseHeader(std::string detectFile, - int threads){ - - std::string out; - out += "#DetectFile " + detectFile + "\n"; - out += "#Threads " + std::to_string(threads) + "\n"; + out += "#SystemStartTime " + str + "\n"; + out += "#Software " + std::string(executablePath) + "\n"; out += "#Version " + std::string(VERSION) + "\n"; out += "#Commit " + std::string(gitcommit) + "\n"; @@ -183,15 +188,14 @@ std::map< std::string, std::string > import_reference_pfasta( std::string fastaF std::string getExePath(void){ - int PATH_MAX=1000; - char result[ PATH_MAX ]; - ssize_t count = readlink( "/proc/self/exe", result, PATH_MAX ); - const char *path; + std::string s(executablePath); + return s; +} + - if (count != -1) path = dirname(dirname(result)); - else throw MissingModelPath(); +std::string getGitCommit(void){ - std::string s(path); + std::string s(gitcommit); return s; } diff --git a/src/data_IO.h b/src/data_IO.h index 6b217de..088db01 100755 --- a/src/data_IO.h +++ 
b/src/data_IO.h @@ -53,6 +53,7 @@ struct read{ std::vector< double > raw, normalisedEvents, eventLengths; std::map< unsigned int, unsigned int > refToQuery; std::vector< std::pair< unsigned int, unsigned int > > eventAlignment; + std::map< unsigned int, std::pair< unsigned int, unsigned int >> eventIdx2rawIdx; std::map posToScore; EventAlignment alignmentQCs; int refStart, refEnd; @@ -73,9 +74,9 @@ std::map< std::string, std::string > import_reference( std::string ); std::map< std::string, std::string > import_reference_pfasta( std::string ); std::vector< std::pair< double, double > > import_poreModel( std::string ); std::string getExePath(void); +std::string getGitCommit(void); std::string writeDetectHeader(std::string, std::string, std::string, int, bool, unsigned int, unsigned int, double, bool); std::string writeRegionsHeader(std::string, double, bool, unsigned int, unsigned int, double, double); -std::string writeForkSenseHeader(std::string detectFile, int threads); unsigned int sixMer2index(std::string &); diff --git a/src/detect.cpp b/src/detect.cpp index 2adaf44..f718179 100755 --- a/src/detect.cpp +++ b/src/detect.cpp @@ -26,8 +26,8 @@ #include "../htslib/htslib/sam.h" #include "../tensorflow/include/tensorflow/c/eager/c_api.h" #include "htsInterface.h" -#include "tensor.h" -#include "alignment.h" +//#include "tensor.h" +//#include "alignment.h" #include "error_handling.h" #include @@ -350,29 +350,6 @@ std::vector< unsigned int > getPOIs( std::string &refSeq, int windowLength ){ } -std::string methylateSequence( std::string &inSeq ){ - - std::string outSeq = inSeq; - - for ( unsigned int i = 0; i < inSeq.size(); i++ ){ - - //CpG - if ( inSeq.substr(i,2) == "CG" ) outSeq.replace(i,1,"M"); - - //GpC - //if ( inSeq.substr(i,2) == "GC" ) outSeq.replace(i+1,1,"M"); - - //Dam methylation (methyl-adenine in GATC) - //if ( inSeq.substr(i,4) == "GATC" ) outSeq.replace(i+1,1,"M"); - - //Dcm methylation (methyl-cytosine second cytonsine of CCAGG and CCTGG) - //if ( 
inSeq.substr(i,5) == "CCAGG" ) outSeq.replace(i+1,1,"M"); - //if ( inSeq.substr(i,5) == "CCTGG" ) outSeq.replace(i+1,1,"M"); - } - return outSeq; -} - - std::string llAcrossRead( read &r, unsigned int windowLength, int &failedEvents, @@ -517,7 +494,7 @@ std::string llAcrossRead( read &r, std::string sixMerRef = (r.referenceSeqMappedTo).substr(posOnRef, 6); if ( r.isReverse ){ - globalPosOnRef = r.refEnd - posOnRef - 6; + globalPosOnRef = r.refEnd - posOnRef - 1; sixMerQuery = reverseComplement( sixMerQuery ); sixMerRef = reverseComplement( sixMerRef ); } @@ -728,83 +705,92 @@ std::cerr << logLikelihoodRatio << std::endl; } -TF_Tensor *read2tensor(std::shared_ptr r, const TensorShape &shape){ +void read2tensor(std::shared_ptr r, const TensorShape &shape, TF_Tensor *t){ - std::vector unformattedTensor = r -> makeTensor(); + std::vector unformattedTensor = r -> makeTensor(); size_t size = unformattedTensor.size(); //put a check in here for size - auto output_array = std::make_unique(size); + //auto output_array = std::make_unique(size); + std::cout << "mark1" << std::endl; + float *output_array = (float *)malloc(size*sizeof(float)); for(size_t i = 0; i < size; i++){ output_array[i] = unformattedTensor[i]; } - - auto output = tf_obj_unique_ptr(TF_NewTensor(TF_FLOAT, - shape.values, - shape.dim, - (void *)output_array.get(), - size*sizeof(float), - cpp_array_deallocator, - nullptr)); - if(output) output_array.release(); - - return output.release(); + std::cout << "mark2" << std::endl; + + t = TF_NewTensor(TF_FLOAT, + shape.values, + shape.dim, + (void *)output_array, + size*sizeof(float), + cpp_array_deallocator, + nullptr); + std::cout << "mark3" << std::endl; + free(output_array); } std::string runCNN(std::shared_ptr r, std::shared_ptr session){ - std::pair protoShape = r -> getShape(); - TensorShape input_shape={{1, (int64_t) protoShape.first, (int64_t) protoShape.second}, 3}; - auto input_values = tf_obj_unique_ptr(read2tensor(r, input_shape)); - if(!input_values){ 
- std::cerr << "Tensor creation failure." << std::endl; - exit (EXIT_FAILURE); + int NumInputs = 1; + int NumOutputs = 1; + + std::vector protoShape = r -> getShape(); + TensorShape input_shape={{1, (int64_t) protoShape[0], (int64_t) protoShape[1]}, 3}; + + std::vector unformattedTensor = r -> makeTensor(); + + size_t size = unformattedTensor.size(); + assert(size > 0); + + float *tmp_array = (float *)malloc(size*sizeof(float)); + for(size_t i = 0; i < size; i++){ + tmp_array[i] = unformattedTensor[i]; } - CStatus status; - TF_Tensor* inputs[]={input_values.get()}; - TF_Tensor* outputs[1]={}; + TF_Tensor* InputValues = TF_NewTensor(TF_FLOAT, + input_shape.values, + input_shape.dim, + (void *)tmp_array, + size*sizeof(float), + cpp_array_deallocator, + nullptr); - TF_SessionRun(*(session->session.get()), nullptr, - &session->inputs, inputs, 1, - &session->outputs, outputs, 1, - nullptr, 0, nullptr, status.ptr); + TF_Tensor* OutputValues; - auto _output_holder = tf_obj_unique_ptr(outputs[0]); + //Run the Session + CStatus status; + TF_SessionRun(*(session->session.get()), NULL, &session->inputs, &InputValues, NumInputs, &session->outputs, &OutputValues, NumOutputs, NULL, 0, NULL, status.ptr); - if(status.failure()){ - status.dump_error(); - exit (EXIT_FAILURE); + if(TF_GetCode(status.ptr) != TF_OK) + { + printf("%s",TF_Message(status.ptr)); } - TF_Tensor &output = *outputs[0]; - if(TF_TensorType(&output) != TF_FLOAT){ + if(TF_TensorType(OutputValues) != TF_FLOAT){ std::cerr << "Error, unexpected output tensor type." 
<< std::endl; exit (EXIT_FAILURE); } - std::string str_output; - unsigned int outputFields = 2; - - //write the header - str_output += ">" + r -> getReadID() + " " + r -> getChromosome() + " " + std::to_string(r -> getMappingLower()) + " " + std::to_string(r -> getMappingUpper()) + " " + r -> getStrand() + "\n"; //header + unsigned int outputFields = 3; //get positions on the read reference to write the output std::vector positions = r -> getPositions(); std::vector sixMers = r -> getSixMers(); - size_t output_size = TF_TensorByteSize(&output) / sizeof(float); - assert(output_size == protoShape.first * outputFields); - auto output_array = (const float *)TF_TensorData(&output); + size_t output_size = TF_TensorByteSize(OutputValues) / sizeof(float); + assert(output_size == protoShape[0] * outputFields); + float *output_array = (float *)TF_TensorData(OutputValues); //write the output unsigned int pos = 0; std::vector lines; lines.reserve(positions.size()); std::string thisPosition = std::to_string(positions[0]); - std::string str_line; + std::string str_line, str_output; + str_output += ">" + r -> getReadID() + " " + r -> getChromosome() + " " + std::to_string(r -> getMappingLower()) + " " + std::to_string(r -> getMappingUpper()) + " " + r -> getStrand() + "\n"; //header for(size_t i = 0; i < output_size; i++){ if((i+1)%outputFields==0){ @@ -814,7 +800,7 @@ std::string runCNN(std::shared_ptr r, std::shared_ptr continue; } - str_line += thisPosition + "\t" + std::to_string(output_array[i]); + str_line += thisPosition + "\t" + std::to_string(output_array[i])+ "\t" + std::to_string(output_array[i-1]); if (r -> getStrand() == "rev") str_line += "\t" + reverseComplement(sixMers[pos]); else str_line += "\t" + sixMers[pos]; lines.push_back(str_line); @@ -826,6 +812,9 @@ std::string runCNN(std::shared_ptr r, std::shared_ptr } } + TF_DeleteTensor(OutputValues); + TF_DeleteTensor(InputValues); + if (r -> getStrand() == "rev") std::reverse(lines.begin(),lines.end()); for (auto s 
= lines.begin(); s < lines.end(); s++){ @@ -836,6 +825,87 @@ std::string runCNN(std::shared_ptr r, std::shared_ptr } +std::map> runCNN_training(std::shared_ptr r, std::shared_ptr session){ + + int NumInputs = 1; + int NumOutputs = 1; + + std::map> analogueCalls; + + std::vector protoShape = r -> getShape(); + TensorShape input_shape={{1, (int64_t) protoShape[0], (int64_t) protoShape[1]}, 3}; + + std::vector unformattedTensor = r -> makeTensor(); + + size_t size = unformattedTensor.size(); + assert(size > 0); + + float *tmp_array = (float *)malloc(size*sizeof(float)); + for(size_t i = 0; i < size; i++){ + tmp_array[i] = unformattedTensor[i]; + } + + TF_Tensor* InputValues = TF_NewTensor(TF_FLOAT, + input_shape.values, + input_shape.dim, + (void *)tmp_array, + size*sizeof(float), + cpp_array_deallocator, + nullptr); + + TF_Tensor* OutputValues; + + //Run the Session + CStatus status; + TF_SessionRun(*(session->session.get()), NULL, &session->inputs, &InputValues, NumInputs, &session->outputs, &OutputValues, NumOutputs, NULL, 0, NULL, status.ptr); + + if(TF_GetCode(status.ptr) != TF_OK) + { + printf("%s",TF_Message(status.ptr)); + } + + if(TF_TensorType(OutputValues) != TF_FLOAT){ + std::cerr << "Error, unexpected output tensor type." 
<< std::endl; + exit (EXIT_FAILURE); + } + + unsigned int outputFields = 3; + + //get positions on the read reference to write the output + std::vector positions = r -> getPositions(); + std::vector sixMers = r -> getSixMers(); + + size_t output_size = TF_TensorByteSize(OutputValues) / sizeof(float); + assert(output_size == protoShape[0] * outputFields); + float *output_array = (float *)TF_TensorData(OutputValues); + + //write the output + unsigned int pos = 0; + unsigned int thisPosition = positions[0]; + for(size_t i = 0; i < output_size; i++){ + if((i+1)%outputFields==0){ + + //only output T positions + if (sixMers[pos].substr(0,1) != "T"){ + pos++; + continue; + } + + analogueCalls[thisPosition] = std::make_pair(output_array[i],output_array[i-1]); + pos++; + } + else{ + if (i != output_size-1) thisPosition = positions[pos]; + } + } + + TF_DeleteTensor(OutputValues); + TF_DeleteTensor(InputValues); + + return analogueCalls; +} + + int detect_main( int argc, char** argv ){ Arguments args = parseDetectArguments( argc, argv ); @@ -847,15 +917,16 @@ int detect_main( int argc, char** argv ){ //get the neural network model path std::string pathExe = getExePath(); - std::string modelPath = pathExe + "/dnn_models/" + "BrdU_detect.pb"; + std::string modelPath = pathExe + "dnn_models/detect_model_BrdUEdU/"; + std::string input_layer_name = "serving_default_input_1"; + std::shared_ptr session; - if (args.useGPU) { - session = model_load_gpu(modelPath.c_str(), "input_1", "time_distributed/Reshape_1",args.GPUdevice,args.threads); + if (not args.useGPU){ + session = model_load_cpu(modelPath.c_str(), args.threads, input_layer_name.c_str()); } else{ - session = model_load_cpu(modelPath.c_str(), "input_1", "time_distributed/Reshape_1",args.threads); - + session = model_load_gpu(modelPath.c_str(), args.GPUdevice, args.threads, input_layer_name.c_str()); } //import fasta reference @@ -865,7 +936,7 @@ int detect_main( int argc, char** argv ){ if ( not outFile.is_open() ) throw 
IOerror( args.outputFilename ); //write the outfile header - std::string outHeader = writeDetectHeader(args.bamFilename, args.referenceFilename, args.indexFilename, args.threads, false, args.minQ, args.minL, args.dilation,args.useGPU); + std::string outHeader = writeDetectHeader(args.bamFilename, args.referenceFilename, args.indexFilename, args.threads, false, args.minQ, args.minL, args.dilation, args.useGPU); outFile << outHeader; htsFile* bam_fh; @@ -902,6 +973,7 @@ int detect_main( int argc, char** argv ){ unsigned int maxBufferSize; std::vector< bam1_t * > buffer; maxBufferSize = 16*(args.threads); + //maxBufferSize = args.threads; //if ( args.threads <= 4 ) maxBufferSize = args.threads; //else maxBufferSize = 4*(args.threads); @@ -927,9 +999,7 @@ int detect_main( int argc, char** argv ){ /*if we've filled up the buffer with short reads, compute them in parallel */ if (buffer.size() >= maxBufferSize or (buffer.size() > 0 and result == -1 ) ){ - //std::vector>> buffer_ar(buffer.size()); - - #pragma omp parallel for schedule(dynamic) shared(buffer,windowLength_align,analogueModel,thymidineModel,args,prog,failed) num_threads(args.threads) + #pragma omp parallel for schedule(dynamic) shared(buffer,windowLength_align,analogueModel,thymidineModel,args,prog,failed,session) num_threads(args.threads) for (unsigned int i = 0; i < buffer.size(); i++){ read r; @@ -983,13 +1053,16 @@ int detect_main( int argc, char** argv ){ } std::string readOut = runCNN(ar.second,session); - prog++; + prog++; + pb.displayProgress( prog, failed, failedEvents ); + #pragma omp critical { outFile << readOut; pb.displayProgress( prog, failed, failedEvents ); } + } diff --git a/src/detect.h b/src/detect.h index 79978f9..3684fec 100755 --- a/src/detect.h +++ b/src/detect.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. 
Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If @@ -13,6 +13,8 @@ #include #include #include "data_IO.h" +#include "alignment.h" +#include "tensor.h" /*function prototypes */ @@ -22,5 +24,6 @@ std::vector< unsigned int > getPOIs( std::string &, int ); void parseIndex( std::string, std::map< std::string, std::string > &, bool & ); double sequenceProbability( std::vector &, std::string &, size_t, bool, PoreParameters, size_t, size_t ); std::map llAcrossRead_forTraining( read &, unsigned int); +std::map> runCNN_training(std::shared_ptr r, std::shared_ptr session); #endif diff --git a/src/error_handling.h b/src/error_handling.h index bb07607..20914b7 100755 --- a/src/error_handling.h +++ b/src/error_handling.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If diff --git a/src/event_handling.cpp b/src/event_handling.cpp index bcc2be6..9087760 100755 --- a/src/event_handling.cpp +++ b/src/event_handling.cpp @@ -56,11 +56,7 @@ void bulk_getEvents( std::string fast5Filename, std::string readID, std::vector< //open the file hid_t hdf5_file = H5Fopen(fast5Filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); - if (hdf5_file < 0){ - std::cerr << "ReadID " << readID << " is in the bam file but possibly not in the DNAscent index file." << std::endl; - std::cerr << "Ensure fast5 files were not renamed after indexing and that all basecalled fast5 files were indexed." 
<< std::endl; - throw IOerror(fast5Filename.c_str()); - } + if (hdf5_file < 0) throw IOerror(fast5Filename.c_str()); //get the channel parameters std::string scaling_path = "/read_" + readID + "/channel_id"; @@ -103,15 +99,11 @@ void bulk_getEvents( std::string fast5Filename, std::string readID, std::vector< } -void getEvents( std::string fast5Filename, std::string readID, std::vector &raw, float &sample_rate ){ +void getEvents( std::string fast5Filename, std::vector &raw, float &sample_rate ){ //open the file hid_t hdf5_file = H5Fopen(fast5Filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); - if (hdf5_file < 0){ - std::cerr << "ReadID " << readID << " is in the bam file but possibly not in the DNAscent index file." << std::endl; - std::cerr << "Ensure fast5 files were not renamed after indexing and that all basecalled fast5 files were indexed." << std::endl; - throw IOerror(fast5Filename.c_str()); - } + if (hdf5_file < 0) throw IOerror(fast5Filename.c_str()); //get the channel parameters const char *scaling_path = "/UniqueGlobalKey/channel_id"; @@ -565,7 +557,6 @@ void adaptive_banded_simple_event_align( std::vector< double > &raw, read &r, Po double yi = (event - rescale.shift - rescale.scale*mu); rescale.var += yi * yi / (stdv * stdv); - //nNormalised++; } rescale.var /= raw.size();//(double) nNormalised; rescale.var = sqrt(rescale.var); @@ -631,7 +622,7 @@ void normaliseEvents( read &r, bool bulkFast5 ){ try{ if (bulkFast5) bulk_getEvents(r.filename, r.readID, r.raw, sample_rate); - else getEvents( r.filename, r.readID, r.raw, sample_rate); + else getEvents( r.filename, r.raw, sample_rate); } catch ( BadFast5Field &bf5 ){ @@ -646,15 +637,33 @@ void normaliseEvents( read &r, bool bulkFast5 ){ //get the event mean and length r.normalisedEvents.reserve(et.n); r.eventLengths.reserve(et.n); + unsigned int rawStart = 0; for ( unsigned int i = 0; i < et.n; i++ ){ if (et.event[i].mean > 1.0) { + + if (i > 0) r.eventIdx2rawIdx[i-1] = 
std::make_pair(rawStart,et.event[i].start-1); + r.normalisedEvents.push_back( et.event[i].mean ); r.eventLengths.push_back(et.event[i].length / sample_rate); + + rawStart = et.event[i].start; } } + r.eventIdx2rawIdx[et.n-1] = std::make_pair(rawStart,r.raw.size()-1); free(et.event); + //testing - print the event and the raw signals that were used to make it + /* + for (auto e = r.eventIdx2rawIdx.begin(); e != r.eventIdx2rawIdx.end(); e++ ){ + + std::cout << r.normalisedEvents[e -> first] << std::endl; + for (unsigned int e1 = (e->second).first; e1 <= (e->second).second; e1++){ + std::cout << "<" << r.raw[e1] << std::endl; + } + } + */ + // Precompute k-mer ranks for rough rescaling and banded alignment size_t k = 6; size_t n_kmers = r.basecall.size() - k + 1; diff --git a/src/forkSense.cpp b/src/forkSense.cpp index 503d312..287213d 100755 --- a/src/forkSense.cpp +++ b/src/forkSense.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2020 University of Cambridge // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -8,7 +8,6 @@ #include "../tensorflow/include/tensorflow/c/eager/c_api.h" #include -#include "regions.h" #include "data_IO.h" #include "trainGMM.h" #include "forkSense.h" @@ -23,32 +22,25 @@ static const char *help= "forkSense: DNAscent AI executable that calls replication origins, termination sites, and fork movement.\n" "To run DNAscent forkSense, do:\n" -" DNAscent forkSense -d /path/to/BrdUCalls.detect -o /path/to/output.forkSense\n" +" DNAscent forkSense -d /path/to/BrdUCalls.detect -o /path/to/output.forkSense --order EdU,BrdU\n" "Required arguments are:\n" " -d,--detect path to output file from DNAscent detect,\n" -" -o,--output path to output file for forkSense.\n" +" -o,--output path to output file for forkSense,\n" +" --order order in which the analogues were pulsed (EdU,BrdU or BrdU,EdU).\n" "Optional arguments are:\n" " -t,--threads number of threads (default: 1 thread),\n" +" --markAnalogues writes analogue incorporation locations to a bed file (default: off),\n" " --markOrigins writes replication origin locations to a bed file (default: off),\n" " --markTerminations writes replication termination locations to a bed file (default: off),\n" " --markForks writes replication fork locations to a bed file (default: off).\n" "Written by Michael Boemo, Department of Pathology, University of Cambridge.\n" "Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues)."; -struct Arguments { - std::string detectFilename; - std::string outputFilename; - bool markOrigins = false; - bool markTerms = false; - bool markForks = false; - unsigned int threads = 1; -}; - -Arguments parseSenseArguments( int argc, char** argv ){ +forkSenseArgs parseSenseArguments( int argc, char** argv ){ if( argc < 2 ){ - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent regions." << std::endl << help << std::endl; + std::cout << "Exiting with error. Insufficient arguments passed to DNAscent forkSense." 
<< std::endl << help << std::endl; exit(EXIT_FAILURE); } if ( std::string( argv[ 1 ] ) == "-h" or std::string( argv[ 1 ] ) == "--help" ){ @@ -56,11 +48,15 @@ Arguments parseSenseArguments( int argc, char** argv ){ exit(EXIT_SUCCESS); } else if( argc < 4 ){ - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent regions." << std::endl; + std::cout << "Exiting with error. Insufficient arguments passed to DNAscent forkSense." << std::endl; exit(EXIT_FAILURE); } - Arguments args; + forkSenseArgs args; + + bool specifiedDetect = false; + bool specifiedOutput = false; + bool specifiedOrder = false; /*parse the command line arguments */ for ( int i = 1; i < argc; ){ @@ -71,11 +67,19 @@ Arguments parseSenseArguments( int argc, char** argv ){ std::string strArg( argv[ i + 1 ] ); args.detectFilename = strArg; i+=2; + specifiedDetect = true; } else if ( flag == "-o" or flag == "--output" ){ std::string strArg( argv[ i + 1 ] ); args.outputFilename = strArg; i+=2; + specifiedOutput = true; + } + else if (flag == "--order" ){ + std::string strArg( argv[ i + 1 ] ); + args.analogueOrder = strArg; + i+=2; + specifiedOrder = true; } else if ( flag == "-t" or flag == "--threads" ){ @@ -98,708 +102,1078 @@ Arguments parseSenseArguments( int argc, char** argv ){ args.markForks = true; i+=1; } + else if ( flag == "--markAnalogues" ){ + + args.markAnalogues = true; + i+=1; + } else throw InvalidOption( flag ); } if (args.outputFilename == args.detectFilename) throw OverwriteFailure(); + if (not (specifiedDetect and specifiedOutput and specifiedOrder)){ + std::cout << "Exiting with error. Missing required arguments." << std::endl; + std::cout << help << std::endl; + exit(EXIT_FAILURE); + } + + if (args.analogueOrder != "EdU,BrdU" and args.analogueOrder != "BrdU,EdU"){ + std::cout << "Exiting with error. Analogue order should be EdU,BrdU or BrdU,EdU." 
<< std::endl; + std::cout << help << std::endl; + exit(EXIT_FAILURE); + } return args; } -TF_Tensor *read2tensor(DetectedRead &r, const TensorShape &shape){ +std::string writeForkSenseHeader(forkSenseArgs &args, KMeansResult analougeIncorporation){ + + auto t = std::time(nullptr); + auto tm = *std::localtime(&t); + std::ostringstream oss; + oss << std::put_time(&tm, "%d/%m/%Y %H:%M:%S"); + auto str = oss.str(); + + std::string compute = "CPU"; + + std::string out; + out += "#DetectFile " + args.detectFilename + "\n"; + out += "#Threads " + std::to_string(args.threads) + "\n"; + out += "#Compute " + compute + "\n"; + out += "#SystemStartTime " + str + "\n"; + out += "#Software " + std::string(getExePath()) + "\n"; + out += "#Version " + std::string(VERSION) + "\n"; + out += "#Commit " + std::string(getGitCommit()) + "\n"; + out += "#EstimatedRegionBrdU " + std::to_string(analougeIncorporation.centroid_1) + "\n"; + out += "#EstimatedRegionEdU " + std::to_string(analougeIncorporation.centroid_2) + "\n"; + + return out; +} + - size_t size = r.brduCalls.size(); - //put a check in here for size +std::string writeBedHeader( forkSenseArgs &args){ + + auto t = std::time(nullptr); + auto tm = *std::localtime(&t); + std::ostringstream oss; + oss << std::put_time(&tm, "%d/%m/%Y %H:%M:%S"); + auto str = oss.str(); + + std::string compute = "CPU"; + + std::string out; + out += "#DetectFile " + args.detectFilename + "\n"; + out += "#ForkSenseFile " + args.outputFilename + "\n"; + out += "#AnalogueOrder " + args.analogueOrder + "\n"; + out += "#Threads " + std::to_string(args.threads) + "\n"; + out += "#Compute " + compute + "\n"; + out += "#SystemStartTime " + str + "\n"; + out += "#Software " + std::string(getExePath()) + "\n"; + out += "#Version " + std::string(VERSION) + "\n"; + out += "#Commit " + std::string(getGitCommit()) + "\n"; + + return out; +} - r.generateInput(); - auto output_array = std::make_unique(size); - { - for(size_t i = 0; i < size; i++){ - output_array[i] = 
r.tensorInput[i]; +std::vector stitchSegmentation(std::vector &primarySegments, std::vector &secondarySegments){ + + std::map connectivity; + std::vector stitchedSegments; + + int segmentStitch = 2000; + + for (size_t i = 0; i < primarySegments.size(); i++){ + + for (size_t j = i+1; j < primarySegments.size(); j++){ + + //segments shouldn't overlap + assert(primarySegments[j].leftmostCoord >= primarySegments[i].rightmostCoord or primarySegments[i].leftmostCoord >= primarySegments[j].rightmostCoord); + + if (primarySegments[j].leftmostCoord - primarySegments[i].rightmostCoord < segmentStitch){ + + //make sure there's not a short EdU segment between + bool interveningSegment = false; + for (size_t k = 0; k < secondarySegments.size(); k++){ + + //segments shouldn't overlap + assert(primarySegments[i].leftmostCoord >= secondarySegments[k].rightmostCoord or secondarySegments[k].leftmostCoord >= primarySegments[i].rightmostCoord); + + if (primarySegments[i].rightmostCoord < secondarySegments[k].leftmostCoord and secondarySegments[k].rightmostCoord < primarySegments[j].leftmostCoord){ + interveningSegment = true; + break; + } + } + + if (not interveningSegment) connectivity[i] = j; + } } } - - auto output = tf_obj_unique_ptr(TF_NewTensor(TF_FLOAT, - shape.values, shape.dim, - (void *)output_array.get(), size*sizeof(float), - cpp_array_deallocator, nullptr)); - - if(output) output_array.release(); - - return output.release(); + + //stitch + std::vector ignoreIndices; + for (size_t i = 0; i < primarySegments.size(); i++){ + + if (std::find(ignoreIndices.begin(), ignoreIndices.end(), i) != ignoreIndices.end()) continue; + + int targetLeft = i; + int startCoord = primarySegments[targetLeft].leftmostCoord; + int startIdx = primarySegments[targetLeft].leftmostIdx; + int endCoord = primarySegments[targetLeft].rightmostCoord; + int endIdx = primarySegments[targetLeft].rightmostIdx; + + while (connectivity.count(targetLeft) > 0){ + + int idxToMerge = connectivity[targetLeft]; + + 
assert(endCoord < primarySegments[idxToMerge].rightmostCoord); + assert(endIdx < primarySegments[idxToMerge].rightmostIdx); + + endCoord = primarySegments[idxToMerge].rightmostCoord; + endIdx = primarySegments[idxToMerge].rightmostIdx; + ignoreIndices.push_back(idxToMerge); + targetLeft = idxToMerge; + } + + struct ReadSegment s = {startCoord, startIdx, endCoord, endIdx}; + stitchedSegments.push_back(s); + } + return stitchedSegments; } -std::vector pooling = {6,4,4,4}; - +void callSegmentation(DetectedRead &r){ -std::pair callForks(DetectedRead &r){ - - assert(r.positions.size() == r.probabilities.size()); - - float threshold = 0.7; - float threshold_weak = 0.5; - int minLength = 0; - - std::vector> leftForks, leftForks_weak, rightForks, rightForks_weak; - std::string outBedLeft,outBedRight; + int minLength = 1000; - bool inFork = false; - int forkStart = -1, potentialEnd = -1; + bool inSegment = false; + int startCoord = -1, endCoord = -1; + int startIdx = -1, endIdx = -1; + + std::vector EdU_segments; + std::vector BrdU_segments; - //rightward-moving forks - for (size_t i = 1; i < r.probabilities.size(); i++){ + for (size_t i = 0; i < r.positions.size(); i++){ - if (r.probabilities[i][2] > threshold and not inFork){ //initialise the site + if (r.EdU_segment_label[i] == 1 and not inSegment){ //initialise the site - forkStart = r.positions[i]; - inFork = true; + startCoord = r.positions[i]; + startIdx = i; + inSegment = true; } - else if (inFork and r.probabilities[i][2] <= threshold and r.probabilities[i-1][2] >= threshold){//flag as a potential end if we dip below the threshold - - potentialEnd = r.positions[i]; - } - else if (inFork and (r.probabilities[i][0] > threshold or r.probabilities[i][1] > threshold)){//close if we've confidently moved to something else - - assert(forkStart != -1 and potentialEnd != -1); - - if ( abs(potentialEnd - forkStart) >= minLength ){ - rightForks.push_back(std::make_pair(forkStart,potentialEnd)); + else if (inSegment and 
(r.thymidine_segment_label[i] == 1 or r.BrdU_segment_label[i] == 1)){//close if we've confidently moved to something else + + endCoord = r.positions[i]; + endIdx = i; + + assert(startCoord != -1 and endCoord != -1); + + if ( abs(endCoord - startCoord) >= minLength ){ + + std::pair trim = segmentationTrim(r.positions, r.eduCalls, r.brduCalls, startIdx, endIdx); + startIdx += trim.first; + endIdx -= trim.second; + startCoord = r.positions[startIdx]; + endCoord = r.positions[endIdx]; + + assert(startCoord < endCoord); + + struct ReadSegment s = {startCoord, startIdx, endCoord, endIdx}; + EdU_segments.push_back(s); } - inFork = false; - forkStart = -1; - potentialEnd = -1; + inSegment = false; + startCoord = -1; + endCoord = -1; + startIdx = -1; + endIdx = -1; } } - //if we got to the end of the read without closing - if (inFork){ + if (inSegment){ - assert(forkStart != -1); - if (potentialEnd == -1) potentialEnd = r.positions.back(); + assert(startCoord != -1); + if (endCoord == -1){ + endCoord = r.positions.back(); + endIdx = r.positions.size() - 1; + } - if ( abs(potentialEnd - forkStart) >= minLength ){ - rightForks.push_back(std::make_pair(forkStart,potentialEnd)); + if ( abs(endCoord - startCoord) >= minLength ){ + + std::pair trim = segmentationTrim(r.positions, r.eduCalls, r.brduCalls, startIdx, endIdx); + startIdx += trim.first; + endIdx -= trim.second; + startCoord = r.positions[startIdx]; + endCoord = r.positions[endIdx]; + + assert(startCoord < endCoord); + + struct ReadSegment s = {startCoord, startIdx, endCoord, endIdx}; + EdU_segments.push_back(s); } } - inFork = false; - forkStart = -1; - potentialEnd = -1; + startCoord = -1; + endCoord = -1; + startIdx = -1; + endIdx = -1; + inSegment = false; - //weak call rightward-moving forks - for (size_t i = 1; i < r.probabilities.size(); i++){ + for (size_t i = 0; i < r.positions.size(); i++){ - if (r.probabilities[i][2] > threshold_weak and not inFork){ //initialise the site + if (r.BrdU_segment_label[i] == 1 
and not inSegment){ //initialise the site - forkStart = r.positions[i]; - inFork = true; + startCoord = r.positions[i]; + startIdx = i; + inSegment = true; } - else if (inFork and r.probabilities[i][2] >= threshold){//throw it out if it becomes a confident fork call + else if (inSegment and (r.thymidine_segment_label[i] == 1 or r.EdU_segment_label[i] == 1)){//close if we've confidently moved to something else + + endCoord = r.positions[i]; + endIdx = i; + + assert(startCoord != -1 and endCoord != -1); + + if ( abs(endCoord - startCoord) >= minLength ){ + + std::pair trim = segmentationTrim(r.positions, r.brduCalls, r.eduCalls, startIdx, endIdx); + startIdx += trim.first; + endIdx -= trim.second; + startCoord = r.positions[startIdx]; + endCoord = r.positions[endIdx]; + + assert(startCoord < endCoord); + + struct ReadSegment s = {startCoord, startIdx, endCoord, endIdx}; + BrdU_segments.push_back(s); + } - inFork = false; - forkStart = -1; - potentialEnd = -1; + inSegment = false; + startCoord = -1; + endCoord = -1; + startIdx = -1; + endIdx = -1; } - else if (inFork and r.probabilities[i][2] <= threshold_weak and r.probabilities[i-1][2] >= threshold_weak){//flag as a potential end if we dip below the threshold + } + //if we got to the end of the read without closing + if (inSegment){ - potentialEnd = r.positions[i]; + assert(startCoord != -1); + if (endCoord == -1){ + endCoord = r.positions.back(); + endIdx = r.positions.size() - 1; } - else if (inFork and (r.probabilities[i][0] > threshold_weak or r.probabilities[i][1] > threshold_weak)){//close if we've confidently moved to something else - - assert(forkStart != -1 and potentialEnd != -1); - rightForks_weak.push_back(std::make_pair(forkStart,potentialEnd)); - //std::cout << "weak right" << forkStart << " " << potentialEnd << std::endl; - - inFork = false; - forkStart = -1; - potentialEnd = -1; + if ( abs(endCoord - startCoord) >= minLength ){ + + std::pair trim = segmentationTrim(r.positions, r.brduCalls, 
r.eduCalls, startIdx, endIdx); + startIdx += trim.first; + endIdx -= trim.second; + startCoord = r.positions[startIdx]; + endCoord = r.positions[endIdx]; + + assert(startCoord < endCoord); + + struct ReadSegment s = {startCoord, startIdx, endCoord, endIdx}; + BrdU_segments.push_back(s); } } + + r.BrdU_segment = stitchSegmentation(BrdU_segments, EdU_segments); + r.EdU_segment = stitchSegmentation(EdU_segments, BrdU_segments); +} - inFork = false; - forkStart = -1; - potentialEnd = -1; - - //reverse order for leftward-moving forks - std::vector revPositions(r.positions.rbegin(), r.positions.rend()); - std::vector> revProbabilities(r.probabilities.rbegin(), r.probabilities.rend()); - //leftward-moving forks - for (size_t i = 1; i < revProbabilities.size(); i++){ +std::string callOrigins(DetectedRead &r, forkSenseArgs args){ - if (revProbabilities[i][0] > threshold and not inFork){ //initialise the site + std::string outBed; - forkStart = revPositions[i]; - inFork = true; - } - else if (inFork and revProbabilities[i][0] <= threshold and revProbabilities[i-1][0] >= threshold){//flag as a potential end if we dip below the threshold + //match up regions + for ( size_t li = 0; li < r.leftForks.size(); li++ ){ - potentialEnd = revPositions[i]; - } - else if (inFork and (revProbabilities[i][1] > threshold or revProbabilities[i][2] > threshold)){//close if we've confidently moved to something else + //find the closest right fork region + int minDist = std::numeric_limits::max(); + int bestMatch = -1; + for ( size_t ri = 0; ri < r.rightForks.size(); ri++ ){ - assert(forkStart != -1 and potentialEnd != -1); + if (r.rightForks[ri].rightmostCoord < r.leftForks[li].rightmostCoord) continue; - if ( abs(potentialEnd - forkStart) >= minLength ){ - leftForks.push_back(std::make_pair(potentialEnd,forkStart)); + int dist = r.rightForks[ri].rightmostCoord - r.leftForks[li].leftmostCoord; + assert(dist >= 0); + if (dist < minDist){ + minDist = dist; + bestMatch = ri; } - - inFork = false; 
- forkStart = -1; - potentialEnd = -1; } - } - - //if we got to the end of the read without closing - if (inFork){ - assert(forkStart != -1); - if (potentialEnd == -1) potentialEnd = revPositions.back(); - - if ( abs(potentialEnd - forkStart) >= minLength ){ - leftForks.push_back(std::make_pair(potentialEnd,forkStart)); - } - } + //make sure no other left forks are closer + bool failed = false; + if (bestMatch != -1){ - inFork = false; - forkStart = -1; - potentialEnd = -1; + for (size_t l2 = 0; l2 < r.leftForks.size(); l2++){ - //weak call leftward-moving forks - for (size_t i = 1; i < revProbabilities.size(); i++){ + if (l2 == li) continue; - if (revProbabilities[i][0] > threshold_weak and not inFork){ //initialise the site + if (r.rightForks[bestMatch].rightmostCoord < r.leftForks[l2].rightmostCoord) continue; - forkStart = revPositions[i]; - inFork = true; - } - else if (inFork and r.probabilities[i][0] >= threshold){//throw it out if it becomes a confident fork call + int dist = r.rightForks[bestMatch].rightmostCoord - r.leftForks[l2].leftmostCoord; + assert(dist >= 0); - inFork = false; - forkStart = -1; - potentialEnd = -1; - } - else if (inFork and revProbabilities[i][0] <= threshold_weak and revProbabilities[i-1][0] >= threshold_weak){//flag as a potential end if we dip below the threshold + if (dist < minDist){ - potentialEnd = revPositions[i]; + failed = true; + break; + } + } } - else if (inFork and (revProbabilities[i][1] > threshold_weak or revProbabilities[i][2] > threshold_weak)){//close if we've confidently moved to something else + if (failed) continue; + else if (bestMatch != -1){ - assert(forkStart != -1 and potentialEnd != -1); + int orilb = std::min(r.leftForks[li].rightmostCoord, r.rightForks[bestMatch].leftmostCoord); + int oriub = std::max(r.leftForks[li].rightmostCoord, r.rightForks[bestMatch].leftmostCoord); + + int orilb_idx = std::min(r.leftForks[li].rightmostIdx, r.rightForks[bestMatch].leftmostIdx); + int oriub_idx = 
std::max(r.leftForks[li].rightmostIdx, r.rightForks[bestMatch].leftmostIdx); + + struct ReadSegment s = {orilb, orilb_idx, oriub, oriub_idx}; + r.origins.push_back(s); - leftForks_weak.push_back(std::make_pair(potentialEnd,forkStart)); - //std::cout << "weak left" << potentialEnd << " " << forkStart << std::endl; - inFork = false; - forkStart = -1; - potentialEnd = -1; + outBed += r.chromosome + " " + std::to_string(orilb) + " " + std::to_string(oriub) + " " + r.header.substr(1) + "\n"; } } - for (auto lf = leftForks.begin(); lf < leftForks.end(); lf++){ + return outBed; +} - outBedLeft += r.chromosome + " " + std::to_string(lf -> first) + " " + std::to_string(lf -> second) + " " + r.header.substr(1) + "\n"; - } - for (auto rf = rightForks.begin(); rf < rightForks.end(); rf++){ +std::string callTerminations(DetectedRead &r, forkSenseArgs args){ - outBedRight += r.chromosome + " " + std::to_string(rf -> first) + " " + std::to_string(rf -> second) + " " + r.header.substr(1) + "\n"; - } + std::string outBed; - return std::make_pair(outBedLeft,outBedRight); -} + //match up regions + for ( size_t li = 0; li < r.leftForks.size(); li++ ){ + //find the closest right fork region + int minDist = std::numeric_limits::max(); + int bestMatch = -1; + for ( size_t ri = 0; ri < r.rightForks.size(); ri++ ){ -std::string callOrigins(DetectedRead &r){ + if (r.leftForks[li].rightmostCoord < r.rightForks[ri].rightmostCoord ) continue; - assert(r.positions.size() == r.probabilities.size()); + int dist = r.leftForks[li].rightmostCoord - r.rightForks[ri].leftmostCoord; + assert(dist >= 0); - float threshold = 0.7; - float threshold_weak = 0.5; - int minLength = 1000; + if (dist < minDist){ + minDist = dist; + bestMatch = ri; - std::vector> leftForks, leftForks_weak, rightForks, rightForks_weak; - std::string outBed; + } + } - bool inFork = false; - int forkStart = -1, potentialEnd = -1; + //make sure no other left forks are closer + bool failed = false; + if (bestMatch != -1){ - 
//rightward-moving forks - for (size_t i = 1; i < r.probabilities.size(); i++){ + for (size_t l2 = 0; l2 < r.leftForks.size(); l2++){ - if (r.probabilities[i][2] > threshold and not inFork){ //initialise the site + if (li == l2) continue; - forkStart = r.positions[i]; - inFork = true; - } - else if (inFork and r.probabilities[i][2] <= threshold and r.probabilities[i-1][2] >= threshold){//flag as a potential end if we dip below the threshold + if (r.leftForks[l2].rightmostCoord < r.rightForks[bestMatch].rightmostCoord ) continue; - potentialEnd = r.positions[i]; - } - else if (inFork and (r.probabilities[i][0] > threshold or r.probabilities[i][1] > threshold)){//close if we've confidently moved to something else + int dist = r.leftForks[l2].rightmostCoord - r.rightForks[bestMatch].leftmostCoord; + assert(dist >= 0); - assert(forkStart != -1 and potentialEnd != -1); + if (dist < minDist){ - if ( abs(potentialEnd - forkStart) >= minLength ){ - rightForks.push_back(std::make_pair(forkStart,potentialEnd)); + failed = true; + break; + } } - - inFork = false; - forkStart = -1; - potentialEnd = -1; } - } - - //if we got to the end of the read without closing - if (inFork){ + if (failed) continue; + else if (bestMatch != -1){ - assert(forkStart != -1); - if (potentialEnd == -1) potentialEnd = r.positions.back(); + int termlb = std::min(r.leftForks[li].leftmostCoord,r.rightForks[bestMatch].rightmostCoord); + int termub = std::max(r.leftForks[li].leftmostCoord,r.rightForks[bestMatch].rightmostCoord); + + int termlb_idx = std::min(r.leftForks[li].leftmostIdx,r.rightForks[bestMatch].rightmostIdx); + int termub_idx = std::max(r.leftForks[li].leftmostIdx,r.rightForks[bestMatch].rightmostIdx); + + struct ReadSegment s = {termlb, termlb_idx, termub, termub_idx}; - if ( abs(potentialEnd - forkStart) >= minLength ){ - rightForks.push_back(std::make_pair(forkStart,potentialEnd)); + r.terminations.push_back(s); + outBed += r.chromosome + " " + std::to_string(termlb) + " " + 
std::to_string(termub) + " " + r.header.substr(1) + "\n"; } } - inFork = false; - forkStart = -1; - potentialEnd = -1; - - //weak call rightward-moving forks - for (size_t i = 1; i < r.probabilities.size(); i++){ - - if (r.probabilities[i][2] > threshold_weak and not inFork){ //initialise the site - - forkStart = r.positions[i]; - inFork = true; - } - else if (inFork and r.probabilities[i][2] >= threshold){//throw it out if it becomes a confident fork call + return outBed; +} - inFork = false; - forkStart = -1; - potentialEnd = -1; - } - else if (inFork and r.probabilities[i][2] <= threshold_weak and r.probabilities[i-1][2] >= threshold_weak){//flag as a potential end if we dip below the threshold - potentialEnd = r.positions[i]; - } - else if (inFork and (r.probabilities[i][0] > threshold_weak or r.probabilities[i][1] > threshold_weak)){//close if we've confidently moved to something else +std::pair writeAnalogueRegions(DetectedRead &r){ - assert(forkStart != -1 and potentialEnd != -1); + std::string outBedEdU, outBedBrdU; - rightForks_weak.push_back(std::make_pair(forkStart,potentialEnd)); - //std::cout << "weak right" << forkStart << " " << potentialEnd << std::endl; + for (auto i = r.BrdU_segment.begin(); i < r.BrdU_segment.end(); i++){ - inFork = false; - forkStart = -1; - potentialEnd = -1; - } + outBedBrdU += r.chromosome + " " + std::to_string(i -> leftmostCoord) + " " + std::to_string(i -> rightmostCoord) + " " + r.header.substr(1) + "\n"; } + for (auto i = r.EdU_segment.begin(); i < r.EdU_segment.end(); i++){ - inFork = false; - forkStart = -1; - potentialEnd = -1; + outBedEdU += r.chromosome + " " + std::to_string(i -> leftmostCoord) + " " + std::to_string(i -> rightmostCoord) + " " + r.header.substr(1) + "\n"; + } - //reverse order for leftward-moving forks - std::vector revPositions(r.positions.rbegin(), r.positions.rend()); - std::vector> revProbabilities(r.probabilities.rbegin(), r.probabilities.rend()); + return std::make_pair(outBedBrdU,outBedEdU); 
+} - //leftward-moving forks - for (size_t i = 1; i < revProbabilities.size(); i++){ - if (revProbabilities[i][0] > threshold and not inFork){ //initialise the site +std::pair callForks(DetectedRead &r, std::string analogueOrder){ - forkStart = revPositions[i]; - inFork = true; - } - else if (inFork and revProbabilities[i][0] <= threshold and revProbabilities[i-1][0] >= threshold){//flag as a potential end if we dip below the threshold + //maximum distance in bp between EdU and BrdU segments that allow them to be matched + int maxGap = 5000; + + std::vector analogue1_segments, analogue2_segments; + + if (analogueOrder == "EdU,BrdU"){ - potentialEnd = revPositions[i]; - } - else if (inFork and (revProbabilities[i][1] > threshold or revProbabilities[i][2] > threshold)){//close if we've confidently moved to something else + analogue1_segments = r.EdU_segment; + analogue2_segments = r.BrdU_segment; + } + else{ - assert(forkStart != -1 and potentialEnd != -1); + analogue2_segments = r.EdU_segment; + analogue1_segments = r.BrdU_segment; + } - if ( abs(potentialEnd - forkStart) >= minLength ){ - leftForks.push_back(std::make_pair(potentialEnd,forkStart)); - } + std::vector> proto_rightForkPairs, proto_leftForkPairs; - inFork = false; - forkStart = -1; - potentialEnd = -1; - } - } + //match up segments - right fork + for ( size_t li = 0; li < analogue1_segments.size(); li++ ){ - //if we got to the end of the read without closing - if (inFork){ + //find the closest analogue 2 patch + int minDist = std::numeric_limits::max(); + int bestMatch = -1; + for ( size_t ri = 0; ri < analogue2_segments.size(); ri++ ){ - assert(forkStart != -1); - if (potentialEnd == -1) potentialEnd = revPositions.back(); + if (analogue2_segments[ri].leftmostCoord < analogue1_segments[li].rightmostCoord) continue; - if ( abs(potentialEnd - forkStart) >= minLength ){ - leftForks.push_back(std::make_pair(potentialEnd,forkStart)); + int dist = analogue2_segments[ri].leftmostCoord - 
analogue1_segments[li].rightmostCoord; + assert(dist >= 0); + + if (dist < minDist){ + minDist = dist; + bestMatch = ri; + } } - } - inFork = false; - forkStart = -1; - potentialEnd = -1; + //make sure no other analogue 1 patches are closer + bool failed = false; + if (bestMatch != -1){ - //weak call leftward-moving forks - for (size_t i = 1; i < revProbabilities.size(); i++){ + for (size_t l2 = 0; l2 < analogue1_segments.size(); l2++){ - if (revProbabilities[i][0] > threshold_weak and not inFork){ //initialise the site + if (li == l2) continue; - forkStart = revPositions[i]; - inFork = true; - } - else if (inFork and r.probabilities[i][0] >= threshold){//throw it out if it becomes a confident fork call + if (analogue2_segments[bestMatch].leftmostCoord < analogue1_segments[l2].rightmostCoord) continue; - inFork = false; - forkStart = -1; - potentialEnd = -1; - } - else if (inFork and revProbabilities[i][0] <= threshold_weak and revProbabilities[i-1][0] >= threshold_weak){//flag as a potential end if we dip below the threshold + int dist = analogue2_segments[bestMatch].leftmostCoord - analogue1_segments[l2].rightmostCoord; + assert(dist >= 0); + + if (dist < minDist){ - potentialEnd = revPositions[i]; + failed = true; + break; + } + } } - else if (inFork and (revProbabilities[i][1] > threshold_weak or revProbabilities[i][2] > threshold_weak)){//close if we've confidently moved to something else - - assert(forkStart != -1 and potentialEnd != -1); - leftForks_weak.push_back(std::make_pair(potentialEnd,forkStart)); - //std::cout << "weak left" << potentialEnd << " " << forkStart << std::endl; - inFork = false; - forkStart = -1; - potentialEnd = -1; + if (failed) continue; + else if (bestMatch != -1 and minDist < maxGap){ + + assert(analogue1_segments[li].leftmostCoord < analogue2_segments[bestMatch].rightmostCoord); + + analogue1_segments[li].partners++; + analogue2_segments[bestMatch].partners++; + + proto_rightForkPairs.push_back(std::make_pair(li,bestMatch)); } } - 
//match up regions - for ( size_t li = 0; li < leftForks.size(); li++ ){ + //match up segments - left fork + for ( size_t li = 0; li < analogue1_segments.size(); li++ ){ - //find the closest right fork region + //find the closest analogue 2 patch int minDist = std::numeric_limits::max(); int bestMatch = -1; - for ( size_t ri = 0; ri < rightForks.size(); ri++ ){ + for ( size_t ri = 0; ri < analogue2_segments.size(); ri++ ){ - if (leftForks[li].second > rightForks[ri].first ) continue; + if (analogue1_segments[li].leftmostCoord < analogue2_segments[ri].rightmostCoord) continue; + + int dist = analogue1_segments[li].leftmostCoord - analogue2_segments[ri].rightmostCoord; + assert(dist >= 0); - int dist = rightForks[ri].first - leftForks[li].second; if (dist < minDist){ minDist = dist; bestMatch = ri; } } - //make sure no other left forks are closer + //make sure no other analogue 2 patches are closer bool failed = false; if (bestMatch != -1){ - for (size_t l2 = 0; l2 < leftForks.size(); l2++){ + for (size_t l2 = 0; l2 < analogue1_segments.size(); l2++){ - if (l2 == li) continue; - - if (leftForks[l2].second > rightForks[bestMatch].first ) continue; + if (li == l2) continue; - int dist = rightForks[bestMatch].first - leftForks[l2].second; + if (analogue1_segments[l2].leftmostCoord < analogue2_segments[bestMatch].rightmostCoord) continue; + int dist = analogue1_segments[l2].leftmostCoord - analogue2_segments[bestMatch].rightmostCoord; + assert(dist >= 0); + if (dist < minDist){ - failed = true; break; } } } if (failed) continue; - else if (bestMatch != -1){ - - //make sure there are no weak fork calls in between matching left and right forks - bool abort = false; - for ( size_t lw = 0; lw < leftForks_weak.size(); lw++){ - if (leftForks_weak[lw].first > leftForks[li].second and leftForks_weak[lw].second < rightForks[bestMatch].first){ - abort = true; - //std::cout << leftForks_weak[lw].first << " " << leftForks_weak[lw].second << std::endl; - break; - } - } - if (not 
abort){ - for ( size_t rw = 0; rw < rightForks_weak.size(); rw++){ - if (rightForks_weak[rw].first > leftForks[li].second and rightForks_weak[rw].second < rightForks[bestMatch].first){ - //std::cout << rightForks_weak[rw].first << " " << rightForks_weak[rw].second << std::endl; - abort = true; - break; - } - } - } + else if (bestMatch != -1 and minDist < maxGap){ + + assert(analogue2_segments[bestMatch].leftmostCoord < analogue1_segments[li].rightmostCoord); + + analogue2_segments[bestMatch].partners++; + analogue1_segments[li].partners++; + + proto_leftForkPairs.push_back(std::make_pair(bestMatch,li)); + } + } + + //make fork bounds, write to bed files + std::string outBedLeft, outBedRight; + for (auto p = proto_rightForkPairs.begin(); p < proto_rightForkPairs.end(); p++){ + + //left coordinate and index + assert(analogue1_segments[p -> first].partners == 1 or analogue1_segments[p -> first].partners == 2); + int lc = analogue1_segments[p -> first].leftmostCoord; + int li = analogue1_segments[p -> first].leftmostIdx; + if (analogue1_segments[p -> first].partners == 2){ + + lc = (analogue1_segments[p -> first].leftmostCoord + analogue1_segments[p -> first].rightmostCoord) / 2; + li = (analogue1_segments[p -> first].leftmostIdx + analogue1_segments[p -> first].rightmostIdx) / 2; + } - //if we didn't find any problematic weak fork calls between the matching left and right forks, make the call - if (not abort){ - r.origins.push_back(std::make_pair(leftForks[li].second, rightForks[bestMatch].first)); - outBed += r.chromosome + " " + std::to_string(leftForks[li].second) + " " + std::to_string(rightForks[bestMatch].first) + " " + r.header.substr(1) + "\n"; - } + //right coordinate and index + int rc = analogue2_segments[p -> second].rightmostCoord; + int ri = analogue2_segments[p -> second].rightmostIdx; + if (analogue2_segments[p -> second].partners == 2){ + + rc = (analogue2_segments[p -> second].rightmostCoord + analogue2_segments[p -> second].leftmostCoord) / 2; + ri 
= (analogue2_segments[p -> second].rightmostIdx + analogue2_segments[p -> second].leftmostIdx) / 2; } + + struct ReadSegment s = {lc, li, rc, ri}; + r.rightForks.push_back(s); + outBedRight += r.chromosome + " " + std::to_string(lc) + " " + std::to_string(rc) + " " + r.header.substr(1) + "\n"; + } + for (auto p = proto_leftForkPairs.begin(); p < proto_leftForkPairs.end(); p++){ + + //left coordinate and index + assert(analogue2_segments[p -> first].partners == 1 or analogue2_segments[p -> first].partners == 2); + int lc = analogue2_segments[p -> first].leftmostCoord; + int li = analogue2_segments[p -> first].leftmostIdx; + if (analogue2_segments[p -> first].partners == 2){ + + lc = (analogue2_segments[p -> first].leftmostCoord + analogue2_segments[p -> first].rightmostCoord) / 2; + li = (analogue2_segments[p -> first].leftmostIdx + analogue2_segments[p -> first].rightmostIdx) / 2; + } + + //right coordinate and index + int rc = analogue1_segments[p -> second].rightmostCoord; + int ri = analogue1_segments[p -> second].rightmostIdx; + if (analogue1_segments[p -> second].partners == 2){ + + rc = (analogue1_segments[p -> second].rightmostCoord + analogue1_segments[p -> second].leftmostCoord) / 2; + ri = (analogue1_segments[p -> second].rightmostIdx + analogue1_segments[p -> second].leftmostIdx) / 2; + } + + struct ReadSegment s = {lc, li, rc, ri}; + r.leftForks.push_back(s); + outBedLeft += r.chromosome + " " + std::to_string(lc) + " " + std::to_string(rc) + " " + r.header.substr(1) + "\n"; } - return outBed; + return std::make_pair(outBedLeft, outBedRight); } -std::string callTerminations(DetectedRead &r){ +std::pair< std::vector, int > findNeighbours_mod( std::vector &positions, std::vector< double > &calls, std::vector< double > &altCalls, int index, int epsilon ){ + + std::vector< int > neighbourIdx; + int positiveCalls = 0; + int positiveAltCalls = 0; + int ev = positions[index]; + + int windowStart = index-epsilon; + int windowEnd = index+epsilon; + int 
numPositions = positions.size(); + int startIdx = std::max(windowStart, 0); + int endIdx = std::min(windowEnd, numPositions-1); + + for ( int i = startIdx; i <= endIdx; i++ ){ + int runningPos = positions[i]; + int gap = std::abs(ev - runningPos); + + if (gap <= epsilon){ + neighbourIdx.push_back(i); + if (calls[i] > 0.5){ + positiveCalls++; + } + if (altCalls[i] > 0.5){ + positiveAltCalls++; + } + } + } + + int delta = positiveCalls - positiveAltCalls; + int netPositiveCalls = std::max(0,delta); + + return std::make_pair(neighbourIdx, netPositiveCalls); +} - assert(r.positions.size() == r.probabilities.size()); - float threshold = 0.8; - int minLength = 1000; +std::map DBSCAN_mod( std::vector< int > &positions, std::vector< double > &calls, std::vector< double > &altCalls, int epsilon, double minDensity ){ - std::vector> leftForks, rightForks; - std::string outBed; + //labels + //-2 := undefined + //-1 := noise + // 0 <= cluster int - bool inFork = false; - int forkStart = -1, potentialEnd = -1; + //initialise labels + std::map< int, int > index2label; + for ( size_t i = 0; i < positions.size(); i++ ) index2label[i] = -2; - //rightward-moving forks - for (size_t i = 1; i < r.probabilities.size(); i++){ + int C = 0; //cluster counter + for ( size_t i = 0; i < positions.size(); i++ ){ - if (r.probabilities[i][2] > threshold and not inFork){ //initialise the site + if (index2label[i] != -2) continue; + std::pair, int> neighbourPair = findNeighbours_mod( positions, calls, altCalls, i, epsilon ); + std::vector neighbourIndices = neighbourPair.first; + int neighbourCalls = neighbourPair.second; + int minPoints = neighbourIndices.size() * minDensity; + if (neighbourCalls < minPoints) { - forkStart = r.positions[i]; - inFork = true; + index2label[i] = -1; //label as noise + continue; } - else if (inFork and r.probabilities[i][2] <= threshold and r.probabilities[i-1][2] >= threshold){ - potentialEnd = r.positions[i]; + C++; //increment the cluster + index2label[i] = C; + 
std::vector< int > seedSet = neighbourIndices; + seedSet.erase(std::find(seedSet.begin(),seedSet.end(),i)); //seed set is the neighbours minus the event we're at + for ( size_t j = 0; j < seedSet.size(); j++ ){ + + if (index2label[seedSet[j]] == -1) index2label[seedSet[j]] = C; + if (index2label[seedSet[j]] != -2 ) continue; + index2label[seedSet[j]] = C; + std::pair, int> neighbourPair_Inner = findNeighbours_mod(positions, calls, altCalls, seedSet[j], epsilon); + std::vector neighbourIndicesInner = neighbourPair_Inner.first; + int neighbourCallsInner = neighbourPair_Inner.second; + minPoints = neighbourIndicesInner.size() * minDensity; + if (neighbourCallsInner >= minPoints){ + + seedSet.insert(seedSet.end(),neighbourIndicesInner.begin(),neighbourIndicesInner.end()); + } } - else if (inFork and (r.probabilities[i][0] > threshold or r.probabilities[i][1] > threshold)){//close if we've confidently moved to something else - - assert(forkStart != -1 and potentialEnd != -1); + } + return index2label; +} - if ( abs(potentialEnd - forkStart) >= minLength ){ - rightForks.push_back(std::make_pair(forkStart,potentialEnd)); - } - inFork = false; - forkStart = -1; - potentialEnd = -1; +void runDBSCAN(DetectedRead &r, KMeansResult analougeIncorporation){ + + int epsilon = 1000; + + double minBrdUDensity = std::max(0.1,analougeIncorporation.centroid_1 - analougeIncorporation.centroid_1_stdv); + double minEdUDensity = std::max(0.1,analougeIncorporation.centroid_2 - analougeIncorporation.centroid_2_stdv); + + std::map eduLabels = DBSCAN_mod( r.positions, r.eduCalls, r.brduCalls, epsilon, minEdUDensity ); + std::map brduLabels = DBSCAN_mod( r.positions, r.brduCalls, r.eduCalls, epsilon, minBrdUDensity ); + + for (size_t i = 0; i < r.positions.size(); i++){ + + int eduLabel = 0; + int brduLabel = 0; + int thymLabel = 0; + int contestedLabel = 0; + + if (eduLabels[ i ] >= 0 and brduLabels[ i ] < 0){ + eduLabel = 1; + brduLabel = 0; + thymLabel = 0; + contestedLabel = 0; + } + else 
if (brduLabels[ i ] >= 0 and eduLabels[ i ] < 0){ + eduLabel = 0; + brduLabel = 1; + thymLabel = 0; + contestedLabel = 0; + } + else if (brduLabels[ i ] < 0 and eduLabels[ i ] < 0){ + eduLabel = 0; + brduLabel = 0; + thymLabel = 1; + contestedLabel = 0; } + else if (brduLabels[ i ] >= 0 and eduLabels[ i ] >= 0){ + eduLabel = 0; + brduLabel = 0; + thymLabel = 0; + contestedLabel = 1; + } + + r.EdU_segment_label.push_back(eduLabel); + r.BrdU_segment_label.push_back(brduLabel); + r.thymidine_segment_label.push_back(thymLabel); + r.contested_segment_label.push_back(contestedLabel); } +} - //if we got to the end of the read without closing - if (inFork){ - assert(forkStart != -1); - if (potentialEnd == -1) potentialEnd = r.positions.back(); +std::pair segmentationTrim(std::vector< int > &positions, std::vector< double > &calls, std::vector< double > &altCalls, int startIdx, int endIdx){ - if ( abs(potentialEnd - forkStart) >= minLength ){ - rightForks.push_back(std::make_pair(forkStart,potentialEnd)); + int epsilon = 500; //500 bp window + + if (positions[endIdx] - positions[startIdx] < 3*epsilon){ + + return std::make_pair(0,0); + } + + std::vector< int > segmentPositions(positions.begin() + startIdx, positions.begin() + endIdx + 1); + std::vector< double > segmentCalls(calls.begin() + startIdx, calls.begin() + endIdx + 1); + std::vector< double > segmentAltCalls(altCalls.begin() + startIdx, altCalls.begin() + endIdx + 1); + + std::vector segmentDensities; + int maxCallsInd = segmentCalls.size(); + for (int i = 0; i < maxCallsInd; i++){ + + int positiveCalls = 0; + int attempts = 0; + int lb = std::max(0,i-epsilon); + int segmentIdxLen = segmentCalls.size(); + int ub = std::min(segmentIdxLen, i+epsilon); + + for (int j = lb; j < ub; j++){ + + int signedGap = segmentPositions[i]-segmentPositions[j]; + if (std::abs(signedGap) < epsilon){ + + if (segmentCalls[j] > 0.5) positiveCalls++; + if (segmentAltCalls[j] > 0.5) positiveCalls--; + attempts++; + } } + 
segmentDensities.push_back((double)positiveCalls / (double) attempts); + } + + + double minDensity = vectorMean(segmentDensities); + + std::map labels = DBSCAN_mod( segmentPositions, segmentCalls, segmentAltCalls, epsilon, minDensity ); + + int trimFromLeft = 0; + int maxPosInd = segmentPositions.size(); + for (int i = 0; i < maxPosInd; i++){ + + if (labels.at(i) < 0) trimFromLeft++; + else break; + } + + int trimFromRight = 0; + for (int i = maxPosInd - 1; i > 0; i--){ + + if (labels.at(i) < 0) trimFromRight++; + else break; } - inFork = false; - forkStart = -1; - potentialEnd = -1; + return std::make_pair(trimFromLeft, trimFromRight); +} - //reverse order for leftward-moving forks - std::vector revPositions(r.positions.rbegin(), r.positions.rend()); - std::vector> revProbabilities(r.probabilities.rbegin(), r.probabilities.rend()); - //leftward-moving forks - for (size_t i = 1; i < revProbabilities.size(); i++){ +void emptyBuffer(std::vector< DetectedRead > &buffer, forkSenseArgs args, fs_fileManager &fm, KMeansResult analogueIncorporation){ - if (revProbabilities[i][0] > threshold and not inFork){ //initialise the site + #pragma omp parallel for schedule(dynamic) shared(args, analogueIncorporation) num_threads(args.threads) + for ( auto b = buffer.begin(); b < buffer.end(); b++) { - forkStart = revPositions[i]; - inFork = true; + runDBSCAN(*b,analogueIncorporation); + callSegmentation(*b); + + std::vector eduSegment_output( (b -> positions).size(), 0); + std::vector brduSegment_output( (b -> positions).size(), 0); + + //fix the analogue segment indices + for (auto s = (*b).EdU_segment.begin(); s < (*b).EdU_segment.end(); s++){ + + for (int i = (*s).leftmostIdx; i <= (*s).rightmostIdx; i++) eduSegment_output[i] = 1; } - else if (inFork and revProbabilities[i][0] <= threshold and revProbabilities[i-1][0] >= threshold){ - - potentialEnd = revPositions[i]; + for (auto s = (*b).BrdU_segment.begin(); s < (*b).BrdU_segment.end(); s++){ + + for (int i = (*s).leftmostIdx; 
i <= (*s).rightmostIdx; i++) brduSegment_output[i] = 1; } - else if (inFork and (revProbabilities[i][1] > threshold or revProbabilities[i][2] > threshold)){//close if we've confidently moved to something else + + std::string termOutput, originOutput, leftForkOutput, rightForkOutput, BrdUOutput, EdUOutput; + + if (args.markOrigins or args.markTerms or args.markForks){ - assert(forkStart != -1 and potentialEnd != -1); + std::pair forkOutputPair = callForks(*b, args.analogueOrder); + leftForkOutput = forkOutputPair.first; + rightForkOutput = forkOutputPair.second; - if ( abs(potentialEnd - forkStart) >= minLength ){ - leftForks.push_back(std::make_pair(potentialEnd,forkStart)); + if (args.markOrigins){ + + originOutput = callOrigins(*b,args); + } + if (args.markTerms){ + + termOutput = callTerminations(*b,args); } + } + if (args.markAnalogues){ - inFork = false; - forkStart = -1; - potentialEnd = -1; + std::pair analogueOutputPair = writeAnalogueRegions(*b); + BrdUOutput = analogueOutputPair.first; + EdUOutput = analogueOutputPair.second; + } + + //only output segmentation on non-trivial reads that have at least one analogue segment called on them + std::string readOutput; + if ( (*b).EdU_segment.size() > 0 or (*b).BrdU_segment.size() > 0){ + + //write the read header + readOutput += (*b).readID + " " + (*b).chromosome + " " + std::to_string((*b).mappingLower) + " " + std::to_string((*b).mappingUpper) + " " + (*b).strand + "\n"; //header + + for (size_t i = 0; i < (*b).positions.size(); i++){ + + readOutput += std::to_string((*b).positions[i]) + "\t" + std::to_string(eduSegment_output[i]) + "\t" + std::to_string(brduSegment_output[i]) + "\n"; + } + } + #pragma omp critical + { + fm.writeOutput(readOutput, termOutput, originOutput, leftForkOutput, rightForkOutput, BrdUOutput, EdUOutput); } } + buffer.clear(); +} - //if we got to the end of the read without closing - if (inFork){ - assert(forkStart != -1); - if (potentialEnd == -1) potentialEnd = revPositions.back(); 
+KMeansResult twoMeans_fs( std::vector< double > &observations ){ - if ( abs(potentialEnd - forkStart) >= minLength ){ - leftForks.push_back(std::make_pair(potentialEnd,forkStart)); - } + double C1_old = 0.01; + double C2_old = 0.5; + double C1_new = C1_old; + double C2_new = C2_old; + double tol = 0.0001; + int maxIter = 100; + int iter = 0; + + std::vector C1_points_old; + std::vector C2_points_old; + + //make an initial assignment + for ( size_t i = 0; i < observations.size(); i++ ){ + + if ( std::abs(observations[i] - C1_old) < std::abs(observations[i] - C2_old) ) C1_points_old.push_back(observations[i]); + else C2_points_old.push_back(observations[i]); } - //match up regions - for ( size_t li = 0; li < leftForks.size(); li++ ){ + //iterate until tolerance is met + do{ + C1_old = C1_new; + C2_old = C2_new; - //find the closest right fork region - int minDist = std::numeric_limits::max(); - int bestMatch = -1; - for ( size_t ri = 0; ri < rightForks.size(); ri++ ){ + std::vector C1_points_new; + std::vector C2_points_new; - if (leftForks[li].first < rightForks[ri].second ) continue; + for ( size_t i = 0; i < C1_points_old.size(); i++ ){ - int dist = leftForks[li].first - rightForks[ri].second; - if (dist < minDist){ - minDist = dist; - bestMatch = ri; + if ( std::abs(C1_points_old[i] - C1_old) < std::abs(C1_points_old[i] - C2_old) ) C1_points_new.push_back(C1_points_old[i]); + else C2_points_new.push_back(C1_points_old[i]); + } - } - } + for ( size_t i = 0; i < C2_points_old.size(); i++ ){ - //make sure no other left forks are closer - bool failed = false; - if (bestMatch != -1){ + if ( std::abs(C2_points_old[i] - C1_old) < std::abs(C2_points_old[i] - C2_old) ) C1_points_new.push_back(C2_points_old[i]); + else C2_points_new.push_back(C2_points_old[i]); + } - for (size_t l2 = 0; l2 < leftForks.size(); l2++){ + C1_new = vectorMean(C1_points_new); + C2_new = vectorMean(C2_points_new); - if (li == l2) continue; + C1_points_old = C1_points_new; + C2_points_old = 
C2_points_new; - if (leftForks[l2].first < rightForks[bestMatch].second ) continue; + iter++; + }while (iter < maxIter and (std::abs(C1_old - C1_new)>tol or std::abs(C2_old - C2_new)>tol)); + + //compute standard deviations + double C1_stdv = vectorStdv( C1_points_old, C1_new ); + double C2_stdv = vectorStdv( C2_points_old, C2_new ); + + KMeansResult out = {C1_new, C1_stdv, C2_new, C2_stdv}; - int dist = leftForks[l2].first - rightForks[bestMatch].second; - if (dist < minDist){ + return out; +} - failed = true; - break; - } - } + +int parseDetectLine(std::string line, + int &BrdUcalls, + int &EdUcalls, + int &attempts){ + + std::string column; + std::stringstream ssLine(line); + int position = -1, cIndex = 0; + AnalogueScore B, E; + while ( std::getline( ssLine, column, '\t' ) ){ + + if ( cIndex == 0 ){ + + position = std::stoi(column); } - if (failed) continue; - else if (bestMatch != -1){ + else if ( cIndex == 1 ){ - r.terminations.push_back(std::make_pair(rightForks[li].second, leftForks[bestMatch].first)); - outBed += r.chromosome + " " + std::to_string(rightForks[bestMatch].second) + " " + std::to_string(leftForks[li].first) + " " + r.header.substr(1) + "\n"; + E.set(std::stof(column)); } + else if ( cIndex == 2 ){ + + B.set(std::stof(column)); + } + cIndex++; } + assert(position != -1); - return outBed; + + if ( B.get() > 0.5 ){ + BrdUcalls++; + attempts++; + } + else if ( E.get() > 0.5 ){ + EdUcalls++; + attempts++; + } + else{ + attempts++; + } + return position; } -std::string runCNN(DetectedRead &r, std::shared_ptr session){ +KMeansResult estimateAnalogueIncorporation(std::string detectFilename, int readCount){ - TensorShape input_shape={{1, (int64_t)r.brduCalls.size(), 1}, 3}; - auto input_values = tf_obj_unique_ptr(read2tensor(r, input_shape)); - if(!input_values){ - std::cerr << "Tensor creation failure." 
<< std::endl; - exit (EXIT_FAILURE); - } + int readCap = readCount; - CStatus status; - TF_Tensor* inputs[]={input_values.get()}; - TF_Tensor* outputs[1]={}; - TF_SessionRun(*(session->session.get()), nullptr, - &session->inputs, inputs, 1, - &session->outputs, outputs, 1, - nullptr, 0, nullptr, status.ptr); - auto _output_holder = tf_obj_unique_ptr(outputs[0]); + std::ifstream inFile( detectFilename ); + if ( not inFile.is_open() ) throw IOerror( detectFilename ); - if(status.failure()){ - status.dump_error(); - exit (EXIT_FAILURE); - } + std::cout << "Estimating analogue incorporation..." << std::endl; - TF_Tensor &output = *outputs[0]; - if(TF_TensorType(&output) != TF_FLOAT){ - std::cerr << "Error, unexpected output tensor type." << std::endl; - exit(EXIT_FAILURE); - } + std::vector< double > BrdU_callFractions, EdU_callFractions; + + int resolution = 2000; //look in 2 kb segments + + int startingPos = -1; + int progress = 0; + std::string line; + + progressBar pb(readCap,false); + + int BrdUcalls = 0; + int EdUcalls = 0; + int attempts = 0; + int gap = 0; + + while( std::getline( inFile, line ) ){ - std::string str_output; - unsigned int outputFields = 3; - { - str_output += r.readID + " " + r.chromosome + " " + std::to_string(r.mappingLower) + " " + std::to_string(r.mappingUpper) + " " + r.strand + "\n"; //header - size_t output_size = TF_TensorByteSize(&output) / sizeof(float); - assert(output_size == r.brduCalls.size() * outputFields); - auto output_array = (const float *)TF_TensorData(&output); - unsigned int pos = 0; - str_output += std::to_string(r.positions[0]); - for(size_t i = 0; i < output_size; i++){ - if((i+1)%outputFields==0){ - r.probabilities.push_back({output_array[i-2],output_array[i-1],output_array[i]}); - str_output += "\t" + std::to_string(output_array[i-2]) + "\t" + std::to_string(output_array[i]) + "\n"; - pos++; - if (i != output_size-1) str_output += std::to_string(r.positions[pos]); - } + if (line.substr(0,1) == "#") continue; //ignore 
header + if ( line.substr(0,1) == ">" ){ + + progress++; + pb.displayProgress( progress, 0, 0 ); + + if (progress >= readCap) break; + + BrdUcalls = 0, EdUcalls = 0, attempts = 0, gap = 0, startingPos = -1; + continue; } - } - return str_output; -} + int position = parseDetectLine(line, BrdUcalls, EdUcalls, attempts); -void emptyBuffer(std::vector< DetectedRead > &buffer, Arguments args, std::shared_ptr session, std::ofstream &outFile, std::ofstream &originFile, std::ofstream &termFile, std::ofstream &leftForkFile, std::ofstream &rightForkFile, int trimFactor){ + if (position == -1) continue; - #pragma omp parallel for schedule(dynamic) shared(args, outFile, session) num_threads(args.threads) - for ( auto b = buffer.begin(); b < buffer.end(); b++) { + if ( startingPos == -1 ) startingPos = position; + gap = position - startingPos; - b -> trim(trimFactor); - std::string readOutput = runCNN(*b, session); + if ( gap > resolution and attempts >= resolution / 10 ){ - std::string termOutput, originOutput, leftForkOutput, rightForkOutput; - if (args.markOrigins){ + double BrdU_frac = (double) BrdUcalls / (double) attempts; + BrdU_callFractions.push_back( BrdU_frac ); - originOutput = callOrigins(*b); - } - if (args.markTerms){ - termOutput = callTerminations(*b); - } - if (args.markForks){ - std::pair forkOutputPair = callForks(*b); - leftForkOutput = forkOutputPair.first; - rightForkOutput = forkOutputPair.second; - } + double EdU_frac = (double) EdUcalls / (double) attempts; + EdU_callFractions.push_back( EdU_frac ); - #pragma omp critical - { - outFile << readOutput; - if (args.markTerms and (*b).terminations.size() > 0) termFile << termOutput; - if (args.markOrigins and (*b).origins.size() > 0) originFile << originOutput; - if (args.markForks and leftForkOutput.size() > 0) leftForkFile << leftForkOutput; - if (args.markForks and rightForkOutput.size() > 0) rightForkFile << rightForkOutput; + BrdUcalls = 0, EdUcalls = 0, attempts = 0, gap = 0, startingPos = -1; } } - 
buffer.clear(); -} - + std::cout << std::endl << "Done." << std::endl; -bool checkReadLength( int length ){ + KMeansResult BrdU_KMeans = twoMeans_fs( BrdU_callFractions ); + + double BrdU_p; + double BrdU_stdv; + + if (BrdU_KMeans.centroid_1 > BrdU_KMeans.centroid_2){ + BrdU_p = BrdU_KMeans.centroid_1; + BrdU_stdv = BrdU_KMeans.centroid_1_stdv; + } + else{ + BrdU_p = BrdU_KMeans.centroid_2; + BrdU_stdv = BrdU_KMeans.centroid_2_stdv; + } + + + KMeansResult EdU_KMeans = twoMeans_fs( EdU_callFractions ); + + double EdU_p; + double EdU_stdv; + + if (EdU_KMeans.centroid_1 > EdU_KMeans.centroid_2){ + EdU_p = EdU_KMeans.centroid_1; + EdU_stdv = EdU_KMeans.centroid_1_stdv; + } + else{ + EdU_p = EdU_KMeans.centroid_2; + EdU_stdv = EdU_KMeans.centroid_2_stdv; + } - for (auto p = pooling.begin(); p < pooling.end(); p++){ + std::cerr << "Estimated fraction of BrdU substitution in BrdU-positive regions: " << BrdU_p << std::endl; + std::cerr << "Estimated BrdU substitution stdv in BrdU-positive regions: " << BrdU_stdv << std::endl; + std::cerr << "Estimated fraction of EdU substitution in EdU-positive regions: " << EdU_p << std::endl; + std::cerr << "Estimated EdU substitution stdv in EdU-positive regions: " << EdU_stdv << std::endl; - length /= *p; - } - if (length <= 3) return false; - else return true; + inFile.close(); + + KMeansResult out = {BrdU_p, BrdU_stdv, EdU_p, EdU_stdv}; + + return out; } int sense_main( int argc, char** argv ){ - Arguments args = parseSenseArguments( argc, argv ); + forkSenseArgs args = parseSenseArguments( argc, argv ); unsigned int maxBufferSize = 20*(args.threads); - //get the model - std::string pathExe = getExePath(); - std::string modelPath = pathExe + "/dnn_models/" + "forkSense.pb"; - std::shared_ptr session = model_load_cpu(modelPath.c_str(), "conv1d_input", "time_distributed_1/Reshape_1", args.threads); - //get a read count int readCount = 0; std::string line; @@ -811,40 +1185,18 @@ int sense_main( int argc, char** argv ){ } progressBar 
pb(readCount,true); inFile.close(); + + //estimate analogue incorporation + KMeansResult analogueIncorporation = estimateAnalogueIncorporation(args.detectFilename, readCount); //open all the files inFile.open( args.detectFilename ); if ( not inFile.is_open() ) throw IOerror( args.detectFilename ); std::ofstream outFile( args.outputFilename ); - if ( not outFile.is_open() ) throw IOerror( args.outputFilename ); - std::ofstream originFile, termFile, leftForkFile, rightForkFile; - if (args.markTerms){ - - termFile.open("terminations_DNAscent_forkSense.bed"); - if ( not termFile.is_open() ) throw IOerror( "terminations_DNAscent_forkSense.bed" ); - } - if (args.markOrigins){ - - originFile.open("origins_DNAscent_forkSense.bed"); - if ( not originFile.is_open() ) throw IOerror( "origins_DNAscent_forkSense.bed" ); - } - if (args.markForks){ - - leftForkFile.open("leftForks_DNAscent_forkSense.bed"); - if ( not leftForkFile.is_open() ) throw IOerror( "leftForks_DNAscent_forkSense.bed" ); - - rightForkFile.open("rightForks_DNAscent_forkSense.bed"); - if ( not rightForkFile.is_open() ) throw IOerror( "rightForks_DNAscent_forkSense.bed" ); - } - - //write the outfile header - std::string outHeader = writeForkSenseHeader(args.detectFilename, args.threads);; - outFile << outHeader; + + fs_fileManager fm(args, analogueIncorporation); - //compute trim factor - unsigned int trimFactor = 1; - unsigned int failed = 0; - for (auto p = pooling.begin(); p < pooling.end(); p++) trimFactor *= *p; + int failed = 0; std::vector< DetectedRead > readBuffer; int progress = 0; @@ -858,15 +1210,14 @@ int sense_main( int argc, char** argv ){ //check the read length on the back of the buffer if (readBuffer.size() > 0){ - bool longEnough = checkReadLength( readBuffer.back().positions.size() ); - if (not longEnough){ + if (readBuffer.back().positions.size() < 2000){ failed++; readBuffer.pop_back(); } } //empty the buffer if it's full - if (readBuffer.size() >= maxBufferSize) emptyBuffer(readBuffer, 
args, session, outFile, originFile, termFile, leftForkFile, rightForkFile, trimFactor); + if (readBuffer.size() >= maxBufferSize) emptyBuffer(readBuffer, args, fm, analogueIncorporation); progress++; pb.displayProgress( progress, failed, 0 ); @@ -890,12 +1241,16 @@ int sense_main( int argc, char** argv ){ assert(d.mappingUpper > d.mappingLower); readBuffer.push_back(d); } + else if ( line.substr(0,1) == "%" ){ + + continue; //from the old version where we included a cigar string - take this out in a later version + } else{ std::string column; std::stringstream ssLine(line); int position = -1, cIndex = 0; - AnalogueScore B, BM; + AnalogueScore B, E; while ( std::getline( ssLine, column, '\t' ) ){ if ( cIndex == 0 ){ @@ -904,6 +1259,10 @@ int sense_main( int argc, char** argv ){ } else if ( cIndex == 1 ){ + E.set(std::stof(column)); + } + else if ( cIndex == 2 ){ + B.set(std::stof(column)); } cIndex++; @@ -911,24 +1270,18 @@ int sense_main( int argc, char** argv ){ readBuffer.back().positions.push_back(position); readBuffer.back().brduCalls.push_back(B.get()); + readBuffer.back().eduCalls.push_back(E.get()); } } //empty the buffer at the end - if (readBuffer.size() > 0){ - - bool longEnough = checkReadLength( readBuffer.back().positions.size() ); - if (not longEnough){ - failed++; - readBuffer.pop_back(); - } + if (readBuffer.back().positions.size() < 2000){ + readBuffer.pop_back(); } - emptyBuffer(readBuffer, args, session, outFile, originFile, termFile, leftForkFile, rightForkFile, trimFactor); - pb.displayProgress( progress, failed, 0 ); + emptyBuffer(readBuffer, args, fm, analogueIncorporation); + inFile.close(); - outFile.close(); - if (args.markTerms) termFile.close(); - if (args.markOrigins) originFile.close(); + fm.closeAll(); std::cout << std::endl << "Done." 
<< std::endl; diff --git a/src/forkSense.h b/src/forkSense.h index 96e12d6..f4fa18c 100755 --- a/src/forkSense.h +++ b/src/forkSense.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2020 University of Cambridge // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If @@ -11,37 +11,156 @@ #include #include +#include +#include #include "error_handling.h" -/*function prototypes */ int sense_main( int argc, char** argv ); +struct ReadSegment{ + int leftmostCoord = 0; + int leftmostIdx = 0; + int rightmostCoord = 0; + int rightmostIdx = 0; + int partners = 0; +}; + + +struct KMeansResult{ + double centroid_1; + double centroid_1_stdv; + double centroid_2; + double centroid_2_stdv; +}; + +struct forkSenseArgs { + + std::string detectFilename; + std::string outputFilename; + std::string analogueOrder; + bool markOrigins = false; + bool markTerms = false; + bool markForks = false; + bool markAnalogues = false; + unsigned int threads = 1; +}; + +class AnalogueScore{ + + private: + double _score = 0.0; + bool _isSet = false; + public: + void set(double s){ + _score = s; + _isSet = true; + } + double get(void){ + assert(_isSet); + return _score; + } +}; + class DetectedRead{ public: - std::vector< unsigned int > positions; - std::vector< double > brduCalls; + std::vector< int > positions; + std::vector< double > brduCalls, eduCalls; std::string readID, chromosome, strand, header; int mappingLower, mappingUpper; - std::vector> probabilities; - std::vector> origins; - std::vector> terminations; - std::vector tensorInput; - void trim(unsigned int trimFactor){ + std::vector< int > EdU_segment_label, BrdU_segment_label, thymidine_segment_label, contested_segment_label; + std::vector EdU_segment, BrdU_segment; + std::vector origins, terminations, leftForks, rightForks; + std::vector 
tensorInput; + int64_t inputSize; +}; + + +std::string writeForkSenseHeader(forkSenseArgs &, KMeansResult ); +std::string writeBedHeader( forkSenseArgs & ); + +class fs_fileManager{ + + protected: + + forkSenseArgs inputArgs; + std::ofstream outFile, originFile, termFile, leftForkFile, rightForkFile, EdUFile, BrdUFile; + + public: + + fs_fileManager( forkSenseArgs &args , KMeansResult analogueIncorporation){ + + inputArgs = args; + + //main output file + outFile.open( args.outputFilename ); + if ( not outFile.is_open() ) throw IOerror( args.outputFilename ); + std::string outHeader = writeForkSenseHeader(args, analogueIncorporation); + outFile << outHeader; - assert(positions.size() > trimFactor and brduCalls.size() > trimFactor and positions.size() == brduCalls.size()); - unsigned int cropFromEnd = positions.size() % trimFactor; - brduCalls.erase(brduCalls.end() - cropFromEnd, brduCalls.end()); - positions.erase(positions.end() - cropFromEnd, positions.end()); - assert(positions.size() % trimFactor == 0 and brduCalls.size() % trimFactor == 0); - } - void generateInput(void){ + //aux bed files + if (args.markTerms){ - for (size_t i = 0; i < positions.size(); i++){ - tensorInput.push_back(brduCalls[i]); + termFile.open("terminations_DNAscent_forkSense.bed"); + termFile << writeBedHeader(args); + if ( not termFile.is_open() ) throw IOerror( "terminations_DNAscent_forkSense.bed" ); + } + if (args.markOrigins){ + + originFile.open("origins_DNAscent_forkSense.bed"); + originFile << writeBedHeader(args); + if ( not originFile.is_open() ) throw IOerror( "origins_DNAscent_forkSense.bed" ); + } + if (args.markForks){ + + leftForkFile.open("leftForks_DNAscent_forkSense.bed"); + leftForkFile << writeBedHeader(args); + if ( not leftForkFile.is_open() ) throw IOerror( "leftForks_DNAscent_forkSense.bed" ); + + rightForkFile.open("rightForks_DNAscent_forkSense.bed"); + rightForkFile << writeBedHeader(args); + if ( not rightForkFile.is_open() ) throw IOerror( 
"rightForks_DNAscent_forkSense.bed" ); + } + if (args.markAnalogues){ + + BrdUFile.open("BrdU_DNAscent_forkSense.bed"); + BrdUFile << writeBedHeader(args); + if ( not BrdUFile.is_open() ) throw IOerror( "BrdU_DNAscent_forkSense.bed" ); + + EdUFile.open("EdU_DNAscent_forkSense.bed"); + EdUFile << writeBedHeader(args); + if ( not EdUFile.is_open() ) throw IOerror( "EdU_DNAscent_forkSense.bed" ); + } + } + void writeOutput(std::string &readOutput, + std::string &termOutput, + std::string &originOutput, + std::string &leftForkOutput, + std::string &rightForkOutput, + std::string &BrdUOutput, + std::string &EdUOutput ){ + + outFile << readOutput; + if (inputArgs.markTerms and termOutput.size() > 0) termFile << termOutput; + if (inputArgs.markOrigins and originOutput.size() > 0) originFile << originOutput; + if (inputArgs.markForks and leftForkOutput.size() > 0) leftForkFile << leftForkOutput; + if (inputArgs.markForks and rightForkOutput.size() > 0) rightForkFile << rightForkOutput; + if (inputArgs.markAnalogues and BrdUOutput.size() > 0) BrdUFile << BrdUOutput; + if (inputArgs.markAnalogues and EdUOutput.size() > 0) EdUFile << EdUOutput; + } + void closeAll(){ + outFile.close(); + if (inputArgs.markTerms) termFile.close(); + if (inputArgs.markOrigins) originFile.close(); + if (inputArgs.markAnalogues){ + BrdUFile.close(); + EdUFile.close(); } } }; +KMeansResult twoMeans_fs( std::vector< double > & ); +std::pair segmentationTrim(std::vector< int > &, std::vector< double > &, std::vector< double > &, int , int ); + #endif diff --git a/src/htsInterface.cpp b/src/htsInterface.cpp index 4b14bbd..6ee8f9e 100644 --- a/src/htsInterface.cpp +++ b/src/htsInterface.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2020 University of Cambridge // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. 
You should have // received a copy of the license with this software. If diff --git a/src/htsInterface.h b/src/htsInterface.h index 083cd8d..4ba1b4e 100644 --- a/src/htsInterface.h +++ b/src/htsInterface.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2020 University of Cambridge // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If diff --git a/src/index.cpp b/src/index.cpp index 08759a6..3d2fc6b 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -20,7 +20,7 @@ static const char *help= "index: DNAscent executable that builds an index file for DNAscent detect.\n" "To run DNAscent index, do:\n" -" DNAscent index -f /path/to/fast5Directory\n" +" DNAscent index -f /path/to/fast5Directory -s /path/to/sequencing_summary.txt\n" "Required arguments are:\n" " -f,--files full path to fast5 files,\n" " -s,--sequencing-summary path to sequencing summary file Guppy.\n" diff --git a/src/index.h b/src/index.h index dab193a..36f9d0e 100644 --- a/src/index.h +++ b/src/index.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If diff --git a/src/poreModels.h b/src/poreModels.h index 6612f45..0186509 100755 --- a/src/poreModels.h +++ b/src/poreModels.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If diff --git a/src/probability.cpp b/src/probability.cpp index 90a1b17..5397baf 100644 --- a/src/probability.cpp +++ b/src/probability.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If diff --git a/src/probability.h b/src/probability.h index 2090a23..fd1292c 100644 --- a/src/probability.h +++ b/src/probability.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. If diff --git a/src/psl.cpp b/src/psl.cpp deleted file mode 100644 index 5f8fed2..0000000 --- a/src/psl.cpp +++ /dev/null @@ -1,235 +0,0 @@ -//---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford -// Written by Michael A. Boemo (mb915@cam.ac.uk) -// This software is licensed under GPL-3.0. You should have -// received a copy of the license with this software. If -// not, please Email the author. 
-//---------------------------------------------------------- - -#include -#include "psl.h" -#include "common.h" -#include "data_IO.h" -#include "error_handling.h" -#include -#define _USE_MATH_DEFINES - - static const char *help= -"psl: DNAscent executable that builds a PSL file from the output of DNAscent detect.\n" -"To run DNAscent psl, do:\n" -" DNAscent psl -d /path/to/DNAscentOutput.detect -r /path/to/reference.fasta -o /path/to/psl_prefix\n" -"Required arguments are:\n" -" -d,--detect path to output file from DNAscent detect,\n" -" -r,--reference path to genome reference in fasta format,\n" -" -o,--output path to output bed prefix.\n" -"Optional arguments are:\n" -" --threshold probability above which a BrdU call is considered positive (default: 0.8),\n" -" --min minimum read length to compute (default is 1),\n" -" --max maximum read length to compute (default is Inf).\n" -"Written by Michael Boemo, Department of Pathology, University of Cambridge.\n" -"Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues)."; - - -struct Arguments { - std::string detectFilename; - std::string outputFilename; - std::string referenceFilename; - bool cropToMin = false; - unsigned int min = 0; - bool cropToMax = false; - unsigned int max = 0; - double threshold; -}; - -Arguments parsePslArguments( int argc, char** argv ){ - if( argc < 2 ){ - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent psl." << std::endl << help << std::endl; - exit(EXIT_FAILURE); - } - if ( std::string( argv[ 1 ] ) == "-h" or std::string( argv[ 1 ] ) == "--help" ){ - std::cout << help << std::endl; - exit(EXIT_SUCCESS); - } - else if( argc < 4 ){ - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent psl." 
<< std::endl; - exit(EXIT_FAILURE); - } - Arguments args; - args.threshold = 0.8; - - /*parse the command line arguments */ - for ( int i = 1; i < argc; ){ - std::string flag( argv[ i ] ); - if ( flag == "-d" or flag == "--detect" ){ - std::string strArg( argv[ i + 1 ] ); - args.detectFilename = strArg; - i+=2; - } - else if ( flag == "-o" or flag == "--output" ){ - std::string strArg( argv[ i + 1 ] ); - args.outputFilename = strArg + ".psl"; - i+=2; - } - else if ( flag == "--min" ){ - args.cropToMin = true; - std::string strArg( argv[ i + 1 ] ); - args.min = std::stoi( strArg.c_str() ); - i+=2; - } - else if ( flag == "--max" ){ - args.cropToMax = true; - std::string strArg( argv[ i + 1 ] ); - args.max = std::stoi( strArg.c_str() ); - i+=2; - } - else if ( flag == "-r" or flag == "--reference" ){ - std::string strArg( argv[ i + 1 ] ); - args.referenceFilename = strArg; - i+=2; - } - else if ( flag == "-l" or flag == "--threshold" ){ - std::string strArg( argv[ i + 1 ] ); - args.threshold = std::stof( strArg.c_str() ); - i+=2; - } - else throw InvalidOption( flag ); - } - if (args.outputFilename == args.referenceFilename or args.outputFilename == args.detectFilename) throw OverwriteFailure(); - - return args; -} - - void writePSL( readDetection &rd, std::map< std::string, std::string > &reference, std::ofstream &outFile ){ - if (rd.positions.size() == 0) return; - outFile << 0 << " "; //matches - outFile << 0 << " "; //mismatches - outFile << 0 << " "; //repMatches - outFile << 0 << " "; //nCount - outFile << 0 << " "; //qNumInsert - outFile << 0 << " "; //qBaseInsert - outFile << 0 << " "; //tNumInsert - outFile << 0 << " "; //tBaseInsert - outFile << rd.direction << " "; //strand - outFile << rd.readID << " "; //queryName - outFile << rd.mappingUpper - rd.mappingLower << " "; //qSize - outFile << 0 << " "; //qStart - outFile << rd.mappingUpper - rd.mappingLower << " "; //qEnd - outFile << rd.chromosome << " "; //tName - outFile << reference[rd.chromosome].size() 
<< " "; //tSize - outFile << rd.mappingLower << " "; //tStart - outFile << rd.mappingUpper << " "; //tEnd - outFile << rd.positions.size() + 2 << " "; //blockCount - //blockSizes - outFile << 1 << ","; //extra for start - for ( unsigned int i = 0; i < rd.positions.size(); i++ ){ - outFile << 1 << ","; - } - outFile << 1 << ","; //extra for end - outFile << " "; - //qStarts - outFile << 0 << ","; //extra for start - for ( unsigned int i = 0; i < rd.positions.size(); i++ ){ - outFile << rd.positions[i] - rd.mappingLower << ","; - } - outFile << rd.mappingUpper - rd.mappingLower << ","; //extra for end - outFile << " "; - //tStarts - outFile << rd.mappingLower << ","; //extra for start - for ( unsigned int i = 0; i < rd.positions.size(); i++ ){ - outFile << rd.positions[i] << ","; - } - outFile << rd.mappingUpper << ","; //extra for end - outFile << std::endl; -} - int psl_main( int argc, char** argv ){ - - Arguments args = parsePslArguments( argc, argv ); - std::map< std::string, std::string > reference = import_reference_pfasta(args.referenceFilename); - - std::ifstream inFile( args.detectFilename ); - if ( not inFile.is_open() ) throw IOerror( args.detectFilename ); - std::ofstream outFile( args.outputFilename ); - if ( not outFile.is_open() ) throw IOerror( args.outputFilename ); - - std::string line; - std::vector< readDetection > buffer; - bool recordRead = true; - - while ( std::getline( inFile, line ) ){ - - if (line.substr(0,1) == "#") continue; //ignore header - if ( line.substr(0,1) == ">" ){ - - if ( buffer.size() >= 10 ){ - - for ( unsigned int i = 0; i < buffer.size(); i++ ){ - - writePSL( buffer[i], reference, outFile ); - } - buffer.clear(); - } - readDetection rd; - - std::stringstream ssLine(line); - std::string column; - int cIndex = 0; - while ( std::getline( ssLine, column, ' ' ) ){ - - if ( cIndex == 0 ) rd.readID = column; - else if ( cIndex == 1 ) rd.chromosome = column; - else if ( cIndex == 2 ) rd.mappingLower = std::stoi(column); - else if ( 
cIndex == 3 ) rd.mappingUpper = std::stoi(column); - else if ( cIndex == 4 ) rd.strand = column; - else throw DetectParsing(); - cIndex++; - } - assert(rd.mappingUpper > rd.mappingLower); - unsigned int readLength = rd.mappingUpper - rd.mappingLower; - recordRead = true; - if ( (args.cropToMin and readLength < args.min) or (args.cropToMax and readLength > args.max) ){ - - recordRead = false; - continue; - } - - if (rd.strand == "fwd") rd.direction = "+"; - else if (rd.strand == "rev") rd.direction = "-"; - else throw BadStrandDirection(); - buffer.push_back(rd); - } - else if ( recordRead ){ - - std::string column; - std::stringstream ssLine(line); - int position = 0, cIndex = 0; - double B; - - while( std::getline( ssLine, column, '\t' ) ){ - - if (cIndex == 0){ - - position = std::stoi(column); - } - else if (cIndex == 1){ - - B = std::stof(column); - - if ( B > args.threshold ){ - - buffer.back().positions.push_back(position); - } - break; - } - cIndex++; - } - } - } - - for ( unsigned int i = 0; i < buffer.size(); i++ ){ - - writePSL( buffer[i], reference, outFile ); - } - buffer.clear(); - - return 0; -} diff --git a/src/psl.h b/src/psl.h deleted file mode 100644 index 1816e51..0000000 --- a/src/psl.h +++ /dev/null @@ -1,33 +0,0 @@ -//---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford -// Written by Michael A. Boemo (mb915@cam.ac.uk) -// This software is licensed under GPL-3.0. You should have -// received a copy of the license with this software. If -// not, please Email the author. 
-//---------------------------------------------------------- - -#ifndef PSL_H -#define PSL_H - -#include - -struct Track{ - - int lowerBound, upperBound; -}; - -struct readDetection{ - - std::vector< int > positions; - std::vector< double > BrdUProb; - std::string readID, chromosome; - int mappingLower, mappingUpper; - std::vector< Track > tracks; - std::string direction, strand; -}; - - -/*function prototypes */ -int psl_main( int argc, char** argv ); - -#endif diff --git a/src/regions.cpp b/src/regions.cpp deleted file mode 100755 index 038d9a2..0000000 --- a/src/regions.cpp +++ /dev/null @@ -1,539 +0,0 @@ -//---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford -// Written by Michael A. Boemo (mb915@cam.ac.uk) -// This software is licensed under GPL-3.0. You should have -// received a copy of the license with this software. If -// not, please Email the author. -//---------------------------------------------------------- - -#define TEST_CLUSTERING 0 -//#define TEST_COOLDOWN 1 - -#include -#include "regions.h" -#include "common.h" -#include "data_IO.h" -#include "error_handling.h" -#include "trainGMM.h" -#include -#include -#include -#define _USE_MATH_DEFINES - - -static const char *help= -"regions: DNAscent executable that finds regions of analogue incorporation from the output of DNAscent detect.\n" -"To run DNAscent regions, do:\n" -" DNAscent regions -d /path/to/output.detect -o /path/to/output.regions\n" -"Required arguments are:\n" -" -d,--detect path to output file from DNAscent detect,\n" -" -o,--output path to output directory for bedgraph files.\n" -"Optional arguments (if used with HMM-based detect) are:\n" -" --threshold probability above which a BrdU call is considered positive (default: 0.8),\n" -" -c,--cooldown minimum gap between positive analogue calls (default: 4),\n" -" -r,--resolution number of thymidines in a region (default is 100 bp),\n" -" -p,--probability override probability that a 
thymidine 6mer contains a BrdU (default: automatically calculated),\n" -" -z,--zScore override zScore threshold for BrdU call (default: automatically calculated).\n" -"Written by Michael Boemo, Department of Pathology, University of Cambridge.\n" -"Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues)."; - - struct Arguments { - - std::string detectFilename; - double probability, threshold, likelihood; - bool overrideProb,overrideZ,overrideResolution; - unsigned int resolution; - int cooldown; - std::string outputFilename; - bool callReplication; -}; - -Arguments parseRegionsArguments( int argc, char** argv ){ - - if( argc < 2 ){ - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent regions." << std::endl << help << std::endl; - exit(EXIT_FAILURE); - } - if ( std::string( argv[ 1 ] ) == "-h" or std::string( argv[ 1 ] ) == "--help" ){ - std::cout << help << std::endl; - exit(EXIT_SUCCESS); - } - else if( argc < 4 ){ - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent regions." 
<< std::endl; - exit(EXIT_FAILURE); - } - - Arguments args; - - //defaults - we'll override these if the option was specified by the user - args.resolution = 100; - args.threshold = 0; - args.overrideProb = false; - args.overrideZ = false; - args.likelihood = 0.8; - args.cooldown = 4; - args.overrideResolution = false; - - /*parse the command line arguments */ - for ( int i = 1; i < argc; ){ - - std::string flag( argv[ i ] ); - - if ( flag == "-d" or flag == "--detect" ){ - std::string strArg( argv[ i + 1 ] ); - args.detectFilename = strArg; - i+=2; - } - else if ( flag == "-p" or flag == "--probability" ){ - std::string strArg( argv[ i + 1 ] ); - args.probability = std::stof( strArg.c_str() ); - args.overrideProb = true; - i+=2; - } - else if ( flag == "-o" or flag == "--output" ){ - std::string strArg( argv[ i + 1 ] ); - args.outputFilename = strArg; - i+=2; - } - else if ( flag == "-z" or flag == "--zScore" ){ - std::string strArg( argv[ i + 1 ] ); - args.threshold = std::stof(strArg.c_str()); - args.overrideZ = true; - i+=2; - } - else if ( flag == "--replication" ){ - - args.callReplication = true; - i+=1; - } - else if ( flag == "-c" or flag == "--cooldown" ){ - std::string strArg( argv[ i + 1 ] ); - args.cooldown = std::stoi( strArg.c_str() ); - i+=2; - } - else if ( flag == "--threshold" ){ - std::string strArg( argv[ i + 1 ] ); - args.likelihood = std::stof( strArg.c_str() ); - i+=2; - } - else if ( flag == "-r" or flag == "--resolution" ){ - std::string strArg( argv[ i + 1 ] ); - args.resolution = std::stoi( strArg.c_str() ); - args.overrideResolution = true; - i+=2; - } - else throw InvalidOption( flag ); - } - if (args.outputFilename == args.detectFilename) throw OverwriteFailure(); - - return args; -} - - -struct region{ - - std::string call=""; - int start, end; - double score; - std::string forkDir=""; -}; - - -std::pair< double, double > twoMeans( std::vector< double > &observations ){ - - double C1_old = 0.01; - double C2_old = 0.5; - double C1_new 
= C1_old; - double C2_new = C2_old; - double tol = 0.0001; - int maxIter = 100; - int iter = 0; - - std::vector C1_points_old; - std::vector C2_points_old; - - //make an initial assignment - for ( size_t i = 0; i < observations.size(); i++ ){ - - if ( std::abs(observations[i] - C1_old) < std::abs(observations[i] - C2_old) ) C1_points_old.push_back(observations[i]); - else C2_points_old.push_back(observations[i]); - } - - //iterate until tolerance is met - do{ - C1_old = C1_new; - C2_old = C2_new; - - std::vector C1_points_new; - std::vector C2_points_new; - - for ( size_t i = 0; i < C1_points_old.size(); i++ ){ - - if ( std::abs(C1_points_old[i] - C1_old) < std::abs(C1_points_old[i] - C2_old) ) C1_points_new.push_back(C1_points_old[i]); - else C2_points_new.push_back(C1_points_old[i]); - } - - for ( size_t i = 0; i < C2_points_old.size(); i++ ){ - - if ( std::abs(C2_points_old[i] - C1_old) < std::abs(C2_points_old[i] - C2_old) ) C1_points_new.push_back(C2_points_old[i]); - else C2_points_new.push_back(C2_points_old[i]); - } - - //guard against either C1 or C2 being empty - if (C1_points_new.size() == 0){ - - C2_new = vectorMean(C2_points_new); - return std::make_pair(0,C2_new); - } - else if (C2_points_new.size() == 0){ - - C1_new = vectorMean(C1_points_new); - return std::make_pair(C1_new,0); - } - - C1_new = vectorMean(C1_points_new); - C2_new = vectorMean(C2_points_new); - - C1_points_old = C1_points_new; - C2_points_old = C2_points_new; - - iter++; - }while (iter < maxIter and (std::abs(C1_old - C1_new)>tol or std::abs(C2_old - C2_new)>tol)); - -#if TEST_CLUSTERING - std::cerr << ">" << C1_new << std::endl; - for (auto c = C1_points_old.begin(); c < C1_points_old.end(); c++) std::cerr << *c << std::endl; - std::cerr << ">" << C2_new << std::endl; - for (auto c = C2_points_old.begin(); c < C2_points_old.end(); c++) std::cerr << *c << std::endl; -#endif - - return std::make_pair(C1_new,C2_new); -} - - -int parseDetectLine_CNN(std::string line, - double 
callThreshold, - unsigned int cooldownThreshold, - unsigned int &attemptCooldown, - unsigned int &callCooldown, - unsigned int &calls, - unsigned int &attempts){ - - std::string column; - std::stringstream ssLine(line); - int position = -1, cIndex = 0; - AnalogueScore B; - while ( std::getline( ssLine, column, '\t' ) ){ - - if ( cIndex == 0 ){ - - position = std::stoi(column); - } - else if ( cIndex == 1 ){ - - B.set(std::stof(column)); - } - cIndex++; - } - assert(position != -1); - - - if ( B.get() > callThreshold and position - callCooldown >= cooldownThreshold ){ - attemptCooldown = position; - callCooldown = position; - calls++; - attempts++; - } - else if (position - attemptCooldown >= cooldownThreshold){ - attempts++; - attemptCooldown = position; - } - return position; -} - - -std::string getStrand(std::string line){ - - std::stringstream ssLine(line); - int cIndex = 0; - std::string strand = ""; - std::string column; - int countCol = std::count(line.begin(), line.end(), ' '); - assert(countCol == 4); - while ( std::getline( ssLine, column, ' ' ) ){ - - if ( cIndex == 4 ){ - - strand = column; - } - cIndex++; - } - assert(strand != ""); - - return strand; -} - - -void regionsCNN(Arguments args){ - - //get a read count - int readCount = 0; - std::string line; - std::ifstream inFile( args.detectFilename ); - if ( not inFile.is_open() ) throw IOerror( args.detectFilename ); - while( std::getline( inFile, line ) ){ - - if ( line.substr(0,1) == ">" ) readCount++; - } - progressBar pb(readCount,false); - inFile.close(); - - //estimate the fraction of BrdU incorporation - double p; - std::string header; - unsigned int calls = 0, attempts = 0, gap = 0; - - int startingPos = -1; - int progress = 0; - unsigned int callCooldown = 0; - unsigned int attemptCooldown = 0; - std::string strand; - - if ( not args.overrideProb ){ - - inFile.open( args.detectFilename ); - if ( not inFile.is_open() ) throw IOerror( args.detectFilename ); - - std::cout << "Estimating analogue 
incorporation..." << std::endl; - - std::vector< double > callFractions; - while( std::getline( inFile, line ) ){ - - if (line.substr(0,1) == "#") continue; //ignore header - if ( line.substr(0,1) == ">" ){ - - strand = getStrand(line); - progress++; - pb.displayProgress( progress, 0, 0 ); - callCooldown = 0; - attemptCooldown = 0; - calls = 0, attempts = 0, gap = 0, startingPos = -1; - continue; - } - - int position = parseDetectLine_CNN(line, args.likelihood, args.cooldown, attemptCooldown, callCooldown, calls, attempts); - - if (position == -1) continue; - - if ( startingPos == -1 ) startingPos = position; - gap = position - startingPos; - - if ( gap > args.resolution and attempts >= args.resolution / 10 ){ - - double frac = (double) calls / (double) attempts; - callFractions.push_back( frac ); - calls = 0, attempts = 0, gap = 0, startingPos = -1; - } - } - - std::cout << std::endl << "Done." << std::endl; - double k1,k2; - std::tie(k1,k2) = twoMeans( callFractions ); - p = std::max(k1,k2); - -#if !TEST_CLUSTERING - std::cerr << "Estimated fraction of analogue substitution in analogue-positive regions: " << p << std::endl; -#endif - inFile.close(); - } - else p = args.probability; - - if ( not args.overrideZ ){ - - //estimate appropriate z-score threshold - std::cout << "Setting Z-score threshold..." 
<< std::endl; - inFile.open( args.detectFilename ); - if ( not inFile.is_open() ) throw IOerror( args.detectFilename ); - progressBar pb_z(readCount,false); - calls = 0; attempts = 0; gap = 0; - startingPos = -1; - progress = 0; - callCooldown = 0; attemptCooldown = 0; - std::vector allZScores; - while( std::getline( inFile, line ) ){ - - if (line.substr(0,1) == "#") continue; //ignore header - if ( line.substr(0,1) == ">" ){ - - strand = getStrand(line); - progress++; - pb_z.displayProgress( progress, 0, 0 ); - calls = 0, attempts = 0, gap = 0, startingPos = -1; - callCooldown = 0; - attemptCooldown = 0; - } - else{ - - int position = parseDetectLine_CNN(line, args.likelihood, args.cooldown, attemptCooldown, callCooldown, calls, attempts); - - if (position == -1) continue; - if ( startingPos == -1 ) startingPos = position; - gap = position - startingPos; - - if ( gap > args.resolution and attempts >= args.resolution / 10 ){ - - double score = (calls - attempts * p) / sqrt( attempts * p * ( 1 - p) ); - allZScores.push_back(score); - calls = 0, attempts = 0, gap = 0, startingPos = -1; - } - } - } - inFile.close(); - std::cout << "Done." 
<< std::endl; - - std::vector< double > fitParams = gaussianMixtureEM(0.5, -7.0, 1.0, 0., 1.0, allZScores, 0.01, 100 ); - double thym_mu, thym_mix, thym_sigma, brdu_mu, brdu_mix, brdu_sigma; - if (fitParams[1] < fitParams[2]){ - - thym_mix = fitParams[0]; - thym_mu = fitParams[1]; - thym_sigma = fitParams[2]; - brdu_mix = fitParams[3]; - brdu_mu = fitParams[4]; - brdu_sigma = fitParams[5]; - } - else{ - - thym_mix = fitParams[3]; - thym_mu = fitParams[4]; - thym_sigma = fitParams[5]; - brdu_mix = fitParams[0]; - brdu_mu = fitParams[1]; - brdu_sigma = fitParams[2]; - } -#if !TEST_CLUSTERING - std::cerr << "Estimated fraction of thymidine regions: " << thym_mix << std::endl; - std::cerr << "Estimated fraction of BrdU regions: " << brdu_mix << std::endl; - std::cerr << "Thymidine Z-score mean, stdv: " << thym_mu << " " << thym_sigma << std::endl; - std::cerr << "BrdU Z-score mean, stdv: " << brdu_mu << " " << brdu_sigma << std::endl; -#endif - -#if !TEST_CLUSTERING - std::cerr << "Set Z-score threshold: " << brdu_mu << std::endl; -#endif - args.threshold = brdu_mu; - } - - //call regions - inFile.open( args.detectFilename ); - if ( not inFile.is_open() ) throw IOerror( args.detectFilename ); - std::ofstream outFile( args.outputFilename ); - if ( not outFile.is_open() ) throw IOerror( args.outputFilename ); - - //write the regions header - outFile << writeRegionsHeader(args.detectFilename, args.likelihood, true, args.cooldown, args.resolution, p, args.threshold); - - std::ofstream repFile; - if ( args.callReplication ){ - - repFile.open("calledOrigins.dnascent"); - } - std::cout << "Calling regions..." 
<< std::endl; - std::vector< region > buffer; - calls = 0; attempts = 0; gap = 0; - startingPos = -1; - progress = 0; - callCooldown = 0; attemptCooldown = 0; - bool first = true; - while( std::getline( inFile, line ) ){ - - if (line.substr(0,1) == "#") continue; //ignore header - if ( line.substr(0,1) == ">" ){ - - strand = getStrand(line); - progress++; - pb.displayProgress( progress, 0, 0 ); - - if (not first){ - - outFile << header << std::endl; - - for ( auto r = buffer.begin(); r < buffer.end(); r++ ){ - - outFile << r -> start << "\t" << r -> end << "\t" << r -> score << "\t" << r -> call << "\t" << r -> forkDir << std::endl; - } - } - header = line; - buffer.clear(); - calls = 0, attempts = 0, gap = 0, startingPos = -1; - callCooldown = 0; - attemptCooldown = 0; - first = false; - - } - else{ - - int position = parseDetectLine_CNN(line, args.likelihood, args.cooldown, attemptCooldown, callCooldown, calls, attempts); - - if (position == -1) continue; - - if ( startingPos == -1 ) startingPos = position; - gap = position - startingPos; - - if ( gap > args.resolution and attempts >= args.resolution / 10 ){ - - region r; - - r.score = (calls - attempts * p) / sqrt( attempts * p * ( 1 - p) ); - - if ( r.score > args.threshold ) r.call = "BrdU"; - else r.call = "Thym"; - - r.score += fabs(args.threshold); - - r.start = startingPos; - r.end = position; - - buffer.push_back(r); - calls = 0, attempts = 0, gap = 0, startingPos = -1; - } - } - } - - //empty the buffer at the end - if ( buffer.size() > 5 ){ - - outFile << header << std::endl; - - for ( auto r = buffer.begin(); r < buffer.end(); r++ ){ - - outFile << r -> start << "\t" << r -> end << "\t" << r -> score << "\t" << r -> call << "\t" << r -> forkDir << std::endl; - } - } - - if ( repFile.is_open() ) repFile.close(); - inFile.close(); - outFile.close(); - std::cout << std::endl << "Done." 
<< std::endl; - - if (p < 0.1){ - std::cerr << "WARNING: Analogue incorporation is estimated to be low: " << p << std::endl; - std::cerr << " Samples may not have analogue in them; DNAscent regions assumes there are both analogue-positive and analogue-negative regions in the sample." << std::endl; - std::cerr << " The DNAscent regions results may be unreliable. See https://dnascent.readthedocs.io/en/latest/regions.html for details." << std::endl; - } - else if (p > 0.7){ - std::cerr << "WARNING: Analogue incorporation is estimated to be high: " << p << std::endl; - std::cerr << " Samples may be saturated; DNAscent regions assumes there are both analogue-positive and analogue-negative regions in the sample." << std::endl; - std::cerr << " The DNAscent regions results may be unreliable. See https://dnascent.readthedocs.io/en/latest/regions.html for details." << std::endl; - } -} - - -int regions_main( int argc, char** argv ){ - - Arguments args = parseRegionsArguments( argc, argv ); - - regionsCNN(args); - - return 0; -} diff --git a/src/regions.h b/src/regions.h deleted file mode 100755 index 11d4b69..0000000 --- a/src/regions.h +++ /dev/null @@ -1,66 +0,0 @@ -//---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford -// Written by Michael A. Boemo (mb915@cam.ac.uk) -// This software is licensed under GPL-3.0. You should have -// received a copy of the license with this software. If -// not, please Email the author. 
-//---------------------------------------------------------- - -#ifndef REGIONS_H -#define REGIONS_H - -#include - -/*function prototypes */ -int regions_main( int argc, char** argv ); - - -class AnalogueScore{ - - private: - double _score = 0.0; - bool _isSet = false; - public: - void set(double s){ - _score = s; - _isSet = true; - } - double get(void){ - assert(_isSet); - return _score; - } -}; - -class Line{ - - private: - AnalogueScore BrdU; - AnalogueScore Methyl; - AnalogueScore BrdUvMethyl; - unsigned int position; - - public: - Line(unsigned int position){ - this -> position = position; - } - void setBrdUScore(double s){ - BrdU.set(s); - } - void setBrdUvMethylScore(double s){ - BrdUvMethyl.set(s); - } - void setMethylScore(double s){ - Methyl.set(s); - } - double getBrdUScore(void){ - return BrdU.get(); - } - double getBrdUvMethylScore(void){ - return BrdUvMethyl.get(); - } - double getMethylScore(void){ - return Methyl.get(); - } -}; - -#endif diff --git a/src/tensor.cpp b/src/tensor.cpp index 5806916..8640874 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2020 University of Cambridge // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -7,85 +7,50 @@ //---------------------------------------------------------- #include "tensor.h" -//#include "../tensorflow/include/tensorflow/c/c_api.h" #include -//start: adapted from https://github.com/aljabr0/from-keras-to-c -//licensed under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -static TF_Buffer* read_tf_buffer_from_file(const char* file) { - std::ifstream t(file, std::ifstream::binary); - t.exceptions(std::ifstream::failbit | std::ifstream::badbit); - t.seekg(0, std::ios::end); - size_t size = t.tellg(); - auto data = std::make_unique(size); - t.seekg(0); - t.read(data.get(), size); - - TF_Buffer *buf = TF_NewBuffer(); - buf->data = data.release(); - buf->length = size; - buf->data_deallocator = free_cpp_array; - return buf; -} -//end adapted from https://github.com/aljabr0/from-keras-to-c - - -std::shared_ptr model_load_cpu(const char *filename, const char *input_name, const char *output_name, unsigned int threads){ - - int vis = setenv("CUDA_VISIBLE_DEVICES", "", 1); - if (vis == -1){ - std::cerr << "Suppression of GPU devices failed." << std::endl; - } +std::shared_ptr model_load_cpu(const char *saved_model_dir, unsigned int threads, const char *input_layer_name){ - //for CPU-only useage, the tensorflow gpu library will still print out warnings about not finding GPU/CUDA - suppress them here - int env = setenv("TF_CPP_MIN_LOG_LEVEL", "3", 1); - if (env == -1){ - std::cerr << "Suppression of Tensorflow logs and warnings failed." 
<< std::endl; - } - - CStatus status; std::shared_ptr ms = std::make_shared(); - std::shared_ptr graph = std::make_shared(TF_NewGraph()); - - { - // Load a protobuf containing a GraphDef - auto graph_def=read_tf_buffer_from_file(filename); - if(!graph_def) return nullptr; - auto graph_opts=TF_NewImportGraphDefOptions(); - TF_GraphImportGraphDef(*(graph.get()), graph_def, graph_opts, status.ptr); - } - - if(status.failure()){ - status.dump_error(); - return nullptr; - } - ms -> graph = graph; + std::shared_ptr Graph = std::make_shared(TF_NewGraph()); - auto input_op = TF_GraphOperationByName(*(graph.get()), input_name); - auto output_op = TF_GraphOperationByName(*(graph.get()), output_name); - if(!input_op || !output_op){ - return nullptr; - } + TF_Status* Status = TF_NewStatus(); - TF_SessionOptions *opts = TF_NewSessionOptions(); + TF_SessionOptions* SessionOpts = TF_NewSessionOptions(); + TF_Buffer* RunOpts = NULL; - //set multithreading - //the following buffer is equivalent to - //config = tf.ConfigProto(allow_soft_placement=True,device_count = {'CPU':/2},intra_op_parallelism_threads=/2,inter_op_parallelism_threads=2) + //configure the model + CStatus status; uint8_t intra_op_parallelism_threads = std::max((unsigned int)1,threads/2); uint8_t inter_op_parallelism_threads = 2; uint8_t cpus = std::max((unsigned int)1,threads/2); uint8_t buf[]={0xa, 0x7, 0xa, 0x3, 0x43, 0x50, 0x55, 0x10, cpus, 0x10, intra_op_parallelism_threads, 0x28, inter_op_parallelism_threads, 0x38, 0x1}; + TF_SetConfig(SessionOpts, buf,sizeof(buf),status.ptr); + + if(status.failure()){ + std::cout << "Model configuration failed." 
<< std::endl; + } - TF_SetConfig(opts, buf,sizeof(buf),status.ptr); + const char* tags = "serve"; // default model serving tag; can change in future + int ntags = 1; - std::shared_ptr session = std::make_shared(TF_NewSession(*(graph.get()), opts, status.ptr)); + std::shared_ptr session = std::make_shared(TF_LoadSessionFromSavedModel(SessionOpts, RunOpts, saved_model_dir, &tags, ntags, *(Graph.get()), NULL, Status)); + if(TF_GetCode(Status) != TF_OK){ + printf("%s",TF_Message(Status)); + } - if(status.failure()){ + auto input_op = TF_GraphOperationByName(*(Graph.get()), input_layer_name); + auto output_op = TF_GraphOperationByName(*(Graph.get()), "StatefulPartitionedCall"); + if(!output_op){ + std::cout << "bad output name" << std::endl; return nullptr; } - assert(session); + if(!input_op){ + std::cout << "bad input name" << std::endl; + return nullptr; + } + ms -> session = session; ms -> inputs = {input_op, 0}; @@ -95,50 +60,47 @@ std::shared_ptr model_load_cpu(const char *filename, const char *i } -std::shared_ptr model_load_gpu(const char *filename, const char *input_name, const char *output_name, unsigned char device, unsigned int threads){ +std::shared_ptr model_load_gpu(const char *saved_model_dir, unsigned char device, unsigned int threads,const char *input_layer_name){ - CStatus status; std::shared_ptr ms = std::make_shared(); - std::shared_ptr graph = std::make_shared(TF_NewGraph()); - - { - // Load a protobuf containing a GraphDef - auto graph_def=read_tf_buffer_from_file(filename); - if(!graph_def) return nullptr; - auto graph_opts=TF_NewImportGraphDefOptions(); - TF_GraphImportGraphDef(*(graph.get()), graph_def, graph_opts, status.ptr); - } + std::shared_ptr Graph = std::make_shared(TF_NewGraph()); - if(status.failure()){ - status.dump_error(); - return nullptr; - } - ms -> graph = graph; + TF_Status* Status = TF_NewStatus(); - auto input_op = TF_GraphOperationByName(*(graph.get()), input_name); - auto output_op = TF_GraphOperationByName(*(graph.get()), 
output_name); - if(!input_op || !output_op){ - return nullptr; - } + TF_SessionOptions* SessionOpts = TF_NewSessionOptions(); + TF_Buffer* RunOpts = NULL; - //the buffer that follows is equivalent to: - //config = tf.ConfigProto(allow_soft_placement=True,log_device_placement=False,device_count = {'GPU': 1,'CPU':},intra_op_parallelism_threads=/2,inter_op_parallelism_threads=2) - //config.gpu_options.allow_growth=True - //config.gpu_options.visible_device_list= + //configure the model + CStatus status; uint8_t intra_op_parallelism_threads = 1; uint8_t inter_op_parallelism_threads = threads; uint8_t cpus = threads; - uint8_t buf[]={0xa, 0x7, 0xa, 0x3, 0x43, 0x50, 0x55, 0x10, cpus, 0xa, 0x7, 0xa, 0x3, 0x47, 0x50, 0x55, 0x10, 0x1, 0x10, intra_op_parallelism_threads, 0x28, inter_op_parallelism_threads, 0x32, 0x5, 0x20, 0x1, 0x2a, 0x1, device, 0x38, 0x1}; + uint8_t buf[]={0xa, 0x7, 0xa, 0x3, 0x43, 0x50, 0x55, 0x10, cpus, 0xa, 0x7, 0xa, 0x3, 0x47, 0x50, 0x55, 0x10, 0x1, 0x10, intra_op_parallelism_threads, 0x28, inter_op_parallelism_threads, 0x32, 0x5, 0x20, 0x1, 0x2a, 0x1, (unsigned char) device, 0x38, 0x1}; + TF_SetConfig(SessionOpts, buf,sizeof(buf),status.ptr); + + if(status.failure()){ + std::cout << "Model configuration failed." 
<< std::endl; + } - TF_SessionOptions *opts = TF_NewSessionOptions(); - TF_SetConfig(opts, buf,sizeof(buf),status.ptr); - //TF_EnableXLACompilation(opts,true); - std::shared_ptr session = std::make_shared(TF_NewSession(*(graph.get()), opts, status.ptr)); + const char* tags = "serve"; // default model serving tag; can change in future + int ntags = 1; - if(status.failure()){ + std::shared_ptr session = std::make_shared(TF_LoadSessionFromSavedModel(SessionOpts, RunOpts, saved_model_dir, &tags, ntags, *(Graph.get()), NULL, Status)); + if(TF_GetCode(Status) != TF_OK){ + printf("%s",TF_Message(Status)); + } + + auto input_op = TF_GraphOperationByName(*(Graph.get()), input_layer_name); + auto output_op = TF_GraphOperationByName(*(Graph.get()), "StatefulPartitionedCall"); + if(!output_op){ + std::cout << "bad output name" << std::endl; + return nullptr; + } + if(!input_op){ + std::cout << "bad input name" << std::endl; return nullptr; } - assert(session); + ms -> session = session; ms -> inputs = {input_op, 0}; diff --git a/src/tensor.h b/src/tensor.h index 72d10cb..c0d822d 100644 --- a/src/tensor.h +++ b/src/tensor.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2020 University of Cambridge // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -98,7 +98,7 @@ template typename TFObjMeta::UniquePtr tf_obj_unique_ptr(T *obj){ class ModelSession{ public: std::shared_ptr graph; - std::shared_ptr session; + std::shared_ptr session; TF_Output inputs, outputs; }; @@ -128,8 +128,8 @@ struct TensorShape{ //end adapted from https://github.com/aljabr0/from-keras-to-c -std::shared_ptr model_load_cpu(const char *filename, const char *input_name, const char *output_name, unsigned int threads); -std::shared_ptr model_load_gpu(const char *filename, const char *input_name, const char *output_name, unsigned char device, unsigned int threads); +std::shared_ptr model_load_cpu(const char *filename, unsigned int threads, const char *); +std::shared_ptr model_load_gpu(const char *filename, unsigned char device, unsigned int threads, const char *); #endif diff --git a/src/trainCNN.cpp b/src/trainCNN.cpp index c635fa9..46c2788 100755 --- a/src/trainCNN.cpp +++ b/src/trainCNN.cpp @@ -25,10 +25,11 @@ #include "alignment.h" #include "htsInterface.h" #include "error_handling.h" +#include "tensor.h" static const char *help= -"trainCNN: DNAscent executable that generates HMM bootstrapped calls to build training data for DNAscent ResNet training.\n" +"trainCNN: DNAscent executable that generates HMM or CNN bootstrapped calls to build training data for DNAscent ResNet training.\n" "Note: This executable is geared towards developers and advanced users.\n" "To run DNAscent trainCNN, do:\n" " DNAscent trainCNN -b /path/to/alignment.bam -r /path/to/reference.fasta -i /path/to/index.dnascent -o /path/to/output.detect\n" @@ -41,7 +42,10 @@ static const char *help= " -t,--threads number of threads (default is 1 thread),\n" " -m,--maxReads maximum number of reads to consider,\n" " -q,--quality minimum mapping quality (default is 20),\n" -" -l,--length minimum read length in bp (default is 100).\n" +" -l,--length minimum read length in bp (default is 100),\n" +" --HMM use HMM bootstrapping (default is CNN),\n" +" --useRaw write raw signal 
instead of events.\n" + "Written by Michael Boemo, Department of Pathology, University of Cambridge.\n" "Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues)."; @@ -50,7 +54,7 @@ struct Arguments { std::string referenceFilename; std::string outputFilename; std::string indexFilename; - bool methylAware, capReads; + bool methylAware, capReads, useRaw, useHMM; double divergence; int minQ, maxReads; int minL; @@ -86,8 +90,10 @@ Arguments parseDataArguments( int argc, char** argv ){ args.methylAware = false; args.divergence = 0; args.capReads = false; + args.useRaw = false; args.maxReads = 0; args.dilation = 1.0; + args.useHMM = false; /*parse the command line arguments */ @@ -161,6 +167,16 @@ Arguments parseDataArguments( int argc, char** argv ){ args.methylAware = true; i+=1; } + else if ( flag == "--HMM" ){ + + args.useHMM = true; + i+=1; + } + else if ( flag == "--useRaw" ){ + + args.useRaw = true; + i+=1; + } else throw InvalidOption( flag ); } if (args.outputFilename == args.indexFilename or args.outputFilename == args.referenceFilename or args.outputFilename == args.bamFilename) throw OverwriteFailure(); @@ -175,6 +191,13 @@ int data_main( int argc, char** argv ){ Arguments args = parseDataArguments( argc, argv ); bool bulkFast5; + //get the neural network model path + std::string pathExe = getExePath(); + std::string modelPath = pathExe + "dnn_models/detect_model_BrdUEdU/"; + std::string input_layer_name = "serving_default_input_1"; + + std::shared_ptr session = model_load_cpu(modelPath.c_str(), args.threads, input_layer_name.c_str()); + //load DNAscent index std::map< std::string, std::string > readID2path; parseIndex( args.indexFilename, readID2path, bulkFast5 ); @@ -212,7 +235,6 @@ int data_main( int argc, char** argv ){ const char *allReads = "."; itr = sam_itr_querys(bam_idx,bam_hdr,allReads); - unsigned int windowLength = 10; int result; int failedEvents = 0; unsigned int maxBufferSize; @@ -242,7 +264,7 @@ int data_main( 
int argc, char** argv ){ /*if we've filled up the buffer with short reads, compute them in parallel */ if (buffer.size() >= maxBufferSize or (buffer.size() > 0 and result == -1 ) ){ - #pragma omp parallel for schedule(dynamic) shared(buffer,windowLength,analogueModel,thymidineModel,args,prog,failed) num_threads(args.threads) + #pragma omp parallel for schedule(dynamic) shared(buffer,analogueModel,thymidineModel,args,prog,failed) num_threads(args.threads) for (unsigned int i = 0; i < buffer.size(); i++){ read r; @@ -287,8 +309,18 @@ int data_main( int argc, char** argv ){ continue; } - std::map BrdUCalls = llAcrossRead_forTraining( r, windowLength); - std::string readOut = eventalign_train( r, 100, BrdUCalls, args.dilation); + std::map> analogueCalls; + + unsigned int windowLength_align = 50; + std::pair> ar = eventalign_detect( r, windowLength_align, 1.0 ); + if (not ar.first){ + failed++; + prog++; + continue; + } + analogueCalls = runCNN_training(ar.second,session); + + std::string readOut = eventalign_train( r, 100, analogueCalls, args.dilation, args.useRaw); #pragma omp critical { diff --git a/src/trainGMM.cpp b/src/trainGMM.cpp index ec006f5..c9d3896 100644 --- a/src/trainGMM.cpp +++ b/src/trainGMM.cpp @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -136,7 +136,7 @@ std::vector findNeighbours( std::vector &events, double ev, double } -std::map DBSCAN( std::vector< double > events, double epsilon, unsigned int minPoints ){ +std::map DBSCAN( std::vector< double > &events, double epsilon, unsigned int minPoints ){ //labels //-2 := undefined @@ -425,7 +425,7 @@ int train_main( int argc, char** argv ){ std::istringstream ss( line ); std::string sixMer, entry; - double eventMean = 0.0, eventLength = 0.0; + double eventMean = 0.0; int col = 0; while ( std::getline( ss, entry, '\t' ) ){ @@ -439,16 +439,12 @@ int train_main( int argc, char** argv ){ eventMean = atof( entry.c_str() ); } - else if ( col == 3 ){ - - eventLength = atof( entry.c_str() ); - } col++; } - assert (eventMean != 0.0 and eventLength != 0.0); + assert (eventMean != 0.0); - if ( eventLength >= 0.002 and importedEvents[sixMer2index(sixMer)].size() < trainArgs.maxEvents ){ + if ( importedEvents[sixMer2index(sixMer)].size() < trainArgs.maxEvents ){ importedEvents[sixMer2index(sixMer)].push_back( eventMean ); } diff --git a/src/trainGMM.h b/src/trainGMM.h index e64ef22..4bcccd0 100644 --- a/src/trainGMM.h +++ b/src/trainGMM.h @@ -1,5 +1,5 @@ //---------------------------------------------------------- -// Copyright 2019-2020 University of Oxford +// Copyright 2019 University of Oxford // Written by Michael A. Boemo (mb915@cam.ac.uk) // This software is licensed under GPL-3.0. You should have // received a copy of the license with this software. 
If @@ -12,6 +12,6 @@ /*function prototypes */ int train_main( int argc, char** argv ); std::vector< double > gaussianMixtureEM( double, double, double, double, double, std::vector< double > &, double, int ); -std::map DBSCAN( std::vector< double >, double, int ); +std::map DBSCAN( std::vector< double > &, double, unsigned int ); #endif diff --git a/tests/detect/bias_6mers/HMM/README.md b/tests/detect/bias_6mers/HMM/README.md deleted file mode 100644 index 679ec8c..0000000 --- a/tests/detect/bias_6mers/HMM/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# DNAscent Test - 6mer Bias - -This test checks whether there is a bias in the 6mers that DNAscent detects. We want to make sure that DNAscent is using context effects correctly so that even if a particular BrdU-containing 6mer doesn't shift the signal much from the thymidine-only case, using the flanking sequence still means that this 6mer can be detected. - -## Files - -`callsAgainstKL.py` - -## Running - -Run DNAscent detect, ideally on the 0-40-60-80-100% BrdU data released with the Nature Methods paper. Usage for the script is: -`python callsAgainstKL.py /path/to/DNAscentDetect.out /path/to/DNAscent/pore_models/BrdU_full_noThreshold.model` -This will produce a scatter plot, where each point is a different 6mer. The x-axis is the fraction of times the 6mer was called as BrdU, and the y-axis is the KL-divergence of the BrdU-containing 6mer against the thymidine-only pore model. If well-behaved, the scatter plot should look somewhat like a vertical bar, which is to say regardless of the KL-divergence, we're still positively identifying each 6mer with about the same frequency. 
diff --git a/tests/detect/bias_6mers/HMM/callsAgainstKL.py b/tests/detect/bias_6mers/HMM/callsAgainstKL.py deleted file mode 100644 index 957b6f6..0000000 --- a/tests/detect/bias_6mers/HMM/callsAgainstKL.py +++ /dev/null @@ -1,80 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import sys - -#Usage: python callsAgainstKL.py out.detect DNAscent/pore_models/BrdU_full_noThreshold.model - - -def reverseComplement(seq): - newSeq = '' - for s in seq: - if s == 'A': - newSeq += 'T' - elif s == 'T': - newSeq += 'A' - elif s == 'C': - newSeq += 'G' - elif s == 'G': - newSeq += 'C' - else: - warnings.warn("Nucleotides must be A, T, G, or C.") - sys.exit() - - return newSeq[::-1] - - -callThreshold = 2.5 -#parse the detect file -kmer2calls = {} -kmer2attempts = {} -f = open(sys.argv[1],'r') -for line in f: - if line[0] == '>': - [readID, chromosome, start, end, strand] = line.rstrip().split() - else: - [pos, score, kmerRef, kmerRead] = line.rstrip().split() - - if strand == 'rev': - kmerRef = reverseComplement(kmerRef) - - if kmerRef not in kmer2attempts: - kmer2attempts[kmerRef] = 1 - else: - kmer2attempts[kmerRef] += 1 - - score = float(score) - if score < callThreshold: - continue - - if kmerRef not in kmer2calls: - kmer2calls[kmerRef] = 1 - else: - kmer2calls[kmerRef] += 1 -f.close() - -#parse the model file -kmer2KL = {} -f = open(sys.argv[2],'r') -for line in f: - splitLine = line.rstrip().split() - KL = float(splitLine[-1:][0]) - kmer = splitLine[0] - kmer2KL[kmer] = KL -f.close() - -#reshape -x = [] -y = [] -for kmer in kmer2calls: - if kmer in kmer2KL: - x.append(float(kmer2calls[kmer])/kmer2attempts[kmer]) - y.append(kmer2KL[kmer]) - -#plot -plt.figure() -plt.scatter(x,y,alpha=0.2) -plt.xlabel('(Positive Calls)/(Attempts)') -plt.ylabel('KL-Divergence') -plt.savefig('Calls_vs_Divergence.pdf') - diff --git a/tests/detect/bias_6mers/README.md b/tests/detect/bias_6mers/README.md deleted file mode 100644 index 679ec8c..0000000 --- 
a/tests/detect/bias_6mers/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# DNAscent Test - 6mer Bias - -This test checks whether there is a bias in the 6mers that DNAscent detects. We want to make sure that DNAscent is using context effects correctly so that even if a particular BrdU-containing 6mer doesn't shift the signal much from the thymidine-only case, using the flanking sequence still means that this 6mer can be detected. - -## Files - -`callsAgainstKL.py` - -## Running - -Run DNAscent detect, ideally on the 0-40-60-80-100% BrdU data released with the Nature Methods paper. Usage for the script is: -`python callsAgainstKL.py /path/to/DNAscentDetect.out /path/to/DNAscent/pore_models/BrdU_full_noThreshold.model` -This will produce a scatter plot, where each point is a different 6mer. The x-axis is the fraction of times the 6mer was called as BrdU, and the y-axis is the KL-divergence of the BrdU-containing 6mer against the thymidine-only pore model. If well-behaved, the scatter plot should look somewhat like a vertical bar, which is to say regardless of the KL-divergence, we're still positively identifying each 6mer with about the same frequency. diff --git a/tests/detect/bias_6mers/ResNet/README.md b/tests/detect/bias_6mers/ResNet/README.md deleted file mode 100644 index 2cfbebf..0000000 --- a/tests/detect/bias_6mers/ResNet/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# DNAscent Test - 6mer Bias - -This test checks whether there is a bias in the 6mers that DNAscent detects. We want to make sure that DNAscent doesn't aggressively call BrdU just because a 6mer has a high number of thymidines in it. - -## Files - -`callsAgainstKL.py` - -## Running - -Run DNAscent detect, ideally on the 0-40-60-80-100% BrdU data released with the Nature Methods paper. Usage for the script is: -`python callsAgainstTcontent.py /path/to/DNAscentDetect.out` -This will produce a bar plot, where each bar plot shows the average BrdU probability for 6mers with a given number of thymidines. 
diff --git a/tests/detect/bias_6mers/ResNet/callsAgainstTcontent.py b/tests/detect/bias_6mers/ResNet/callsAgainstTcontent.py deleted file mode 100644 index 7cddae5..0000000 --- a/tests/detect/bias_6mers/ResNet/callsAgainstTcontent.py +++ /dev/null @@ -1,73 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import sys -import numpy as np - -#Usage: python callAgainstTcontent.py out.detect - -maxReads = 1000 - -def reverseComplement(seq): - newSeq = '' - for s in seq: - if s == 'A': - newSeq += 'T' - elif s == 'T': - newSeq += 'A' - elif s == 'C': - newSeq += 'G' - elif s == 'G': - newSeq += 'C' - else: - warnings.warn("Nucleotides must be A, T, G, or C.") - sys.exit() - - return newSeq[::-1] - - -#parse the detect file -thymCount2probs = {} -readCtr = 0 -f = open(sys.argv[1],'r') -for line in f: - if line[0] == '#': - continue - - if line[0] == '>': - [readID, chromosome, start, end, strand] = line.rstrip().split() - - readCtr += 1 - if readCtr > maxReads: - break - else: - [pos, BrdUprob, kmerRef] = line.rstrip().split() - - if strand == 'rev': - kmerRef = reverseComplement(kmerRef) - - key = kmerRef.count('T') - - if key not in thymCount2probs: - thymCount2probs[key] = [float(BrdUprob)] - else: - thymCount2probs[key].append(float(BrdUprob)) -f.close() - -#reshape -x = [] -y = [] -y_err = [] -for i in range(1,7): - - x.append(i) - y.append(np.mean(thymCount2probs[i])) - y_err.append(np.std(thymCount2probs[i])) - -#plot -plt.figure() -plt.errorbar(x,y,yerr=y_err) -plt.xlabel('Number of Thymidines in Sixmer') -plt.ylabel('Average Called BrdU Probability') -plt.savefig('Calls_vs_Thymidines.pdf') - diff --git a/tests/detect/bias_6mers/callsAgainstKL.py b/tests/detect/bias_6mers/callsAgainstKL.py deleted file mode 100644 index 46c3025..0000000 --- a/tests/detect/bias_6mers/callsAgainstKL.py +++ /dev/null @@ -1,80 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import sys - -#Usage: python 
callsAgainstKL.py out.detect DNAscent/pore_models/BrdU_full_noThreshold.model - - -def reverseComplement(seq): - newSeq = '' - for s in seq: - if s == 'A': - newSeq += 'T' - elif s == 'T': - newSeq += 'A' - elif s == 'C': - newSeq += 'G' - elif s == 'G': - newSeq += 'C' - else: - warnings.warn("Nucleotides must be A, T, G, or C.") - sys.exit() - - return newSeq[::-1] - - -callThreshold = 2.5 -#parse the detect file -kmer2calls = {} -kmer2attempts = {} -f = open(sys.argv[1],'r') -for line in f: - if line[0] == '>': - [readID, chromosome, start, end, strand] = line.rstrip().split() - else: - [pos, score, kmerRef, kmerRead] = line.rstrip().split() - - if kmerRef not in kmer2attempts: - kmer2attempts[kmerRef] = 1 - else: - kmer2attempts[kmerRef] += 1 - - score = float(score) - if score < callThreshold: - continue - - if strand == 'rev': - kmerRef = reverseComplement(kmerRef) - - if kmerRef not in kmer2calls: - kmer2calls[kmerRef] = 1 - else: - kmer2calls[kmerRef] += 1 -f.close() - -#parse the model file -kmer2KL = {} -f = open(sys.argv[2],'r') -for line in f: - splitLine = line.rstrip().split() - KL = float(splitLine[-1:][0]) - kmer = splitLine[0] - kmer2KL[kmer] = KL -f.close() - -#reshape -x = [] -y = [] -for kmer in kmer2calls: - if kmer in kmer2KL: - x.append(float(kmer2calls[kmer])/kmer2attempts[kmer]) - y.append(kmer2KL[kmer]) - -#plot -plt.figure() -plt.scatter(x,y,alpha=0.2) -plt.xlabel('(Positive Calls)/(Attempts)') -plt.ylabel('KL-Divergence') -plt.savefig('Calls_vs_Divergence.pdf') - diff --git a/tests/detect/event_alignment/README.md b/tests/detect/event_alignment/README.md deleted file mode 100644 index 9b51f1a..0000000 --- a/tests/detect/event_alignment/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# DNAscent Test - Event Alignment - -This test checks the quality of DNAscent's adaptive banded event alignment and makes sure that there are no features of the alignment that could indicate a whole read should be thrown out or a call should be avoided at a 
particular position. - -## Files - -`plotEventAlignments.py` - -## Running - -Set #define TEST_ALIGNMENT 1 in detect.cpp and recompile. Run DNAscent detect on 2018_06_18_CAM_ONT_gDNA_BrdU_40_60_80_100_full barcode08 and barcode11, redirecting stderr to a file. For each of these, run `python plotEventAlignments.py stderr.out DNAscentDetect.out`. diff --git a/tests/detect/event_alignment/plotEventAlignments.py b/tests/detect/event_alignment/plotEventAlignments.py deleted file mode 100644 index 1b38ffc..0000000 --- a/tests/detect/event_alignment/plotEventAlignments.py +++ /dev/null @@ -1,238 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import numpy as np -import sys - -#Usage: python plotEventAlignments.py stderr.out DNAscent.detect -#where stderr.out is the result of a stderr redirect after running DNAscent detect with #define TEST_ALIGNMENT 1 - -maxReads = 2000 -plot = False - -threshold = 1.25 #log likelihood threshold for a positive BrdU call (this should be the default value used by DNAscent regions) - -hist_emission = [] -hist_gap = [] -readID2emission = {} -readID2gap = {} -readID2residual = {} -readID2shift = {} -readID2scale = {} -readID2var = {} - -x=[] -y=[] -progress = 0 - -#go through the alignments in order to: -# -plot the distribution of log emissions and gaps, -# -make individual plots of the alignments (if plot = True), -# -create a map from readID to emission so that we can crosscheck that against positive analogue call rates. 
-f = open(sys.argv[1],'r') -for line in f: - - if line[0] == '>': - - progress += 1 - if progress % 100 == 0: - print(float(progress)/maxReads) - if progress >= maxReads: - break - - if len(x) > 0 and plot and progress < 10: - plt.figure() - plt.plot(x,y) - plt.xlabel('Event Index') - plt.ylabel('kmer Index') - plt.savefig(readID+'.png') - plt.close() - - if len(x) > 0: - - residual = np.polyfit(x,y,1,full=True) - readID2residual[readID] = residual[1][0]/len(x) - - readID = line.rstrip()[1:] - x=[] - y=[] - - else: - - splitLine = line.rstrip().split() - if splitLine[0] == 'avg_log_emission': - hist_emission.append(float(splitLine[1])) - readID2emission[readID] = float(splitLine[1]) - elif splitLine[0] == 'spanned': - continue - elif splitLine[0] == 'maxGap': - hist_gap.append(int(splitLine[1])) - readID2gap[readID] = float(splitLine[1]) - elif splitLine[0] == 'shift': - readID2shift[readID] = float(splitLine[1]) - elif splitLine[0] == 'scale': - readID2scale[readID] = float(splitLine[1]) - elif splitLine[0] == 'var': - readID2var[readID] = float(splitLine[1]) - elif splitLine[0] == 'drift': - continue - else: - [eventIdx,kmerIdx] = splitLine - x.append(int(eventIdx)) - y.append(int(kmerIdx)) -f.close() - -plt.figure() -plt.hist(hist_emission, 50) -plt.xlabel('Average Log Emission') -plt.ylabel('Count') -plt.savefig('hist_log_emission.pdf') -plt.close() - -plt.figure() -plt.hist(hist_gap, 50) -plt.xlabel('Maximum Gap') -plt.ylabel('Count') -plt.savefig('hist_gap.pdf') -plt.close() - -f = open(sys.argv[2],'r') -readID2callFraction = {} -first = True -progress = 0 -for line in f: - - if line[0] == '>': - - progress += 1 - if progress % 100 == 0: - print(float(progress)/maxReads) - if progress >= maxReads: - break - - if not first: - readID2callFraction[readID] = float(brduCalls)/float(numAttempts) - - else: - first = False - splitLine = line.rstrip().split() - readID = splitLine[0][1:] - - numAttempts = 0 - brduCalls = 0 - methylCalls = 0 - brduMethyDeclined = 0 - - 
else: - - splitLine = line.rstrip().split() - logLikelihood = float(splitLine[1]) - if len(splitLine) > 4: - logLikelihood_BrdUvsMethyl = float(splitLine[2]) - logLikelihood_MethylvsThym = float(splitLine[3]) - - if logLikelihood > threshold and logLikelihood_BrdUvsMethyl < threshold: - brduMethyDeclined += 1 - elif logLikelihood > threshold and logLikelihood_BrdUvsMethyl > threshold: - brduCalls += 1 - elif logLikelihood_MethylvsThym > threshold and logLikelihood_BrdUvsMethyl < threshold: - methylCalls += 1 - numAttempts += 1 - - - else: - if logLikelihood > threshold: - brduCalls += 1 - numAttempts += 1 -f.close() - -hist_x = [] -hist_y = [] - -for ID in readID2callFraction: - if ID in readID2emission: - - hist_x.append(readID2callFraction[ID]) - hist_y.append(readID2emission[ID]) - -plt.figure() -plt.scatter(hist_x,hist_y,alpha=0.5) -plt.xlabel('BrdU Calls (Calls/Attempts)') -plt.ylabel('Average Log Emission') -plt.savefig('scatter_emission_vs_calls.pdf') -plt.close() - -hist_x = [] -hist_y = [] -for ID in readID2callFraction: - if ID in readID2gap: - - hist_x.append(readID2callFraction[ID]) - hist_y.append(readID2gap[ID]) - -plt.figure() -plt.scatter(hist_x,hist_y,alpha=0.5) -plt.xlabel('BrdU Calls (Calls/Attempts)') -plt.ylabel('Max Gap') -plt.savefig('scatter_gap_vs_calls.pdf') -plt.close() - -hist_x = [] -hist_y = [] -for ID in readID2callFraction: - if ID in readID2residual: - - hist_x.append(readID2callFraction[ID]) - hist_y.append(readID2residual[ID]) - -plt.figure() -plt.scatter(hist_x,hist_y,alpha=0.5) -plt.xlabel('BrdU Calls (Calls/Attempts)') -plt.ylabel('Residual / Num Events') -plt.savefig('scatter_residual_vs_calls.pdf') -plt.close() - -hist_x = [] -hist_y = [] -for ID in readID2callFraction: - if ID in readID2shift: - - hist_x.append(readID2callFraction[ID]) - hist_y.append(readID2shift[ID]) - -plt.figure() -plt.scatter(hist_x,hist_y,alpha=0.5) -plt.xlabel('BrdU Calls (Calls/Attempts)') -plt.ylabel('Shift') 
-plt.savefig('scatter_shift_vs_calls.pdf') -plt.close() - -hist_x = [] -hist_y = [] -for ID in readID2callFraction: - if ID in readID2scale: - - hist_x.append(readID2callFraction[ID]) - hist_y.append(readID2scale[ID]) - -plt.figure() -plt.scatter(hist_x,hist_y,alpha=0.5) -plt.xlabel('BrdU Calls (Calls/Attempts)') -plt.ylabel('Scale') -plt.savefig('scatter_scale_vs_calls.pdf') -plt.close() - -hist_x = [] -hist_y = [] -for ID in readID2callFraction: - if ID in readID2var: - - hist_x.append(readID2callFraction[ID]) - hist_y.append(readID2var[ID]) - -plt.figure() -plt.scatter(hist_x,hist_y,alpha=0.5) -plt.xlabel('BrdU Calls (Calls/Attempts)') -plt.ylabel('Var') -plt.savefig('scatter_var_vs_calls.pdf') -plt.close() diff --git a/tests/detect/event_alignment/plotEventLengths.py b/tests/detect/event_alignment/plotEventLengths.py deleted file mode 100644 index d9ac659..0000000 --- a/tests/detect/event_alignment/plotEventLengths.py +++ /dev/null @@ -1,20 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import sys - -#Usage: python stderr.out -#where stderr.out is the result of a stderr redirect after running DNAscent detect #define EVENT_LENGTHS 1 in event_handling.cpp - -lengths = [] -f = open(sys.argv[1],'r') -for line in f: - lengths.append(int(line.rstrip())) -f.close() - -plt.figure() -plt.hist(lengths,25,log=True) -plt.xlabel('Event Length') -plt.ylabel('Count') -plt.savefig('eventLengths.pdf') -plt.close() diff --git a/tests/detect/hmm_falsePositives/README.md b/tests/detect/hmm_falsePositives/README.md deleted file mode 100644 index 51231e1..0000000 --- a/tests/detect/hmm_falsePositives/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# DNAscent Test - Test HMM Positive Calls - -This test looks at events that were called as BrdU and checks them against the span on the query sequence and the number of events passed to the HMM to determine whether we can add any meaningful QCs. 
- -## Files - -`testPositiveCalls.py` - -## Running - -Run DNAscent detect with #define TEST_LL 1 and redirect stderr to file.txt. Then run `python testPositiveCalls.py file.txt` to produce the plots. diff --git a/tests/detect/hmm_falsePositives/testPositiveCalls.py b/tests/detect/hmm_falsePositives/testPositiveCalls.py deleted file mode 100644 index 5fbe9f1..0000000 --- a/tests/detect/hmm_falsePositives/testPositiveCalls.py +++ /dev/null @@ -1,122 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import numpy as np -import sys - -#Usage: python testFalsePositives.py DNAscentDetect.stderr -#where DNAscentDetect.stderr is from stderr after running DNAscent detect with #define TEST_LL 1 - -llThreshold = 1.25 - -querySpan2positives = {} -querySpan2count = {} -events2positives = {} -events2count = {} - -KL_scatter = [] -ll_scatter = [] - -f = open(sys.argv[1],'r') -for ctr, line in enumerate(f): - - if line[0] == '<': - - if ctr > 1000000: - break - - idx = 0 - else: - idx += 1 - - if idx == 1: - KL = float(line.rstrip()) - KL_scatter.append(KL) - if idx == 2: - span = int(line.rstrip()) - elif idx == 3: - continue - - elif idx == 4: - events = len(line.rstrip().split()) - elif idx == 5: - ll = float(line.rstrip()) - ll_scatter.append(ll) - #look at query - if span in querySpan2positives: - querySpan2count[span] = querySpan2count[span] + 1 - if ll > llThreshold: - querySpan2positives[span] = querySpan2positives[span] + 1 - else: - querySpan2count[span] = 1 - if ll > llThreshold: - querySpan2positives[span] = 1 - else: - querySpan2positives[span] = 0 - - #look at event number - if events in events2positives: - events2count[events] += 1 - if ll > llThreshold: - events2positives[events] += 1 - else: - events2count[events] = 1 - if ll > llThreshold: - events2positives[events] = 1 - else: - events2positives[events] = 0 - -bar_x = [] -bar_y = [] -bar_z = [] -#normalise -for query in querySpan2count: - bar_x.append(query) - 
bar_y.append(float(querySpan2positives[query])/querySpan2count[query]) - bar_z.append(querySpan2count[query]) - -plt.figure() -plt.bar(bar_x,bar_y) -plt.xlabel('Span on Query') -plt.ylabel('Positive Calls / Attempts') -plt.savefig('falsePositives_querySpan.pdf') -plt.close() - -plt.figure() -plt.bar(bar_x,bar_z) -plt.xlabel('Span on Query') -plt.ylabel('Number of Call Positions') -plt.savefig('falsePositives_querySpan_positions.pdf') -plt.close() - - -bar_x = [] -bar_y = [] -bar_z = [] -#normalise -for events in events2count: - bar_x.append(events) - bar_y.append(float(events2positives[events])/events2count[events]) - bar_z.append(events2count[events]) - -plt.figure() -plt.bar(bar_x,bar_y) -plt.xlabel('Number of Events') -plt.ylabel('Positive Calls / Attempts') -plt.savefig('falsePositives_eventNumbers.pdf') -plt.close() - -plt.figure() -plt.bar(bar_x,bar_z) -plt.xlabel('Number of Events') -plt.ylabel('Number of Call Positions') -plt.savefig('falsePositives_eventNumbers_positions.pdf') -plt.close() - -plt.figure() -plt.scatter(KL_scatter,ll_scatter,alpha=0.3) -plt.xlabel('Running KL Divergence') -plt.ylabel('Log Likelihood Ratio') -plt.savefig('falsePositives_runningKL.pdf') -plt.close() - diff --git a/tests/detect/hmm_forward/README.md b/tests/detect/hmm_forward/README.md deleted file mode 100644 index e6830ed..0000000 --- a/tests/detect/hmm_forward/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# DNAscent Test - Test HMM Forward Algorithm - -This test crosschecks the implementation of the HMM forward algorithm in DNAscent detect against pomegranate (https://github.com/jmschrei/pomegranate) to make sure they agree. - -## Files - -`testHMMForward.py` -`testHMMProbeViterbi.py` -Note that pomegranate (https://github.com/jmschrei/pomegranate) also needs to be installed. - -## Running - -Run DNAscent detect on any sample with #define TEST_HMM 1. Redirect stderr to file.txt and then run `python testHMMForward.py file.txt' to check agreement. 
From left to right, the columns indicate: -* whether BrdU was used in the HMM (0 for thymidine only, 1 for BrdU), -* the log probability of the events from pomegranate, -* the log probability of the events from DNAscent detect. -On that same output file.txt, if it was run on 0% BrdU reads, run `python testHMMProbeViterbi.py` to see plots of Viterbi insertions and deletions and how they correlate with the log likelihood ratio. This is meant to look for any meaningful QCs that could be added. diff --git a/tests/detect/hmm_forward/detect_withPenthus.cpp b/tests/detect/hmm_forward/detect_withPenthus.cpp deleted file mode 100755 index d3781d3..0000000 --- a/tests/detect/hmm_forward/detect_withPenthus.cpp +++ /dev/null @@ -1,1153 +0,0 @@ -//---------------------------------------------------------- -// Copyright 2019 University of Oxford -// Written by Michael A. Boemo (michael.boemo@path.ox.ac.uk) -// This software is licensed under GPL-2.0. You should have -// received a copy of the license with this software. If -// not, please Email the author. 
-//---------------------------------------------------------- - -//#define TEST_HMM 1 -//#define TEST_LL 1 - -#include -#include "detect.h" -#include -#include -#include -#include "common.h" -#include "event_handling.h" -#include "probability.h" -#include "../fast5/include/fast5.hpp" -#include "poreModels.h" - - -#include "../Penthus/src/hmm.h" -#include "../Penthus/src/probability.h" -#include "../Penthus/src/states.h" - - -static const char *help= -"detect: DNAscent executable that detects BrdU in Oxford Nanopore reads.\n" -"To run DNAscent detect, do:\n" -" ./DNAscent detect [arguments]\n" -"Example:\n" -" ./DNAscent detect -b /path/to/alignment.bam -r /path/to/reference.fasta -i /path/to/index.index -o /path/to/output.out -t 20\n" -"Required arguments are:\n" -" -b,--bam path to alignment BAM file,\n" -" -r,--reference path to genome reference in fasta format,\n" -" -i,--index path to DNAscent index,\n" -" -o,--output path to output file that will be generated.\n" -"Optional arguments are:\n" -" -t,--threads number of threads (default is 1 thread),\n" -" --methyl-aware account for CpG, Dcm, and Dam methylation in BrdU calling,\n" -" -q,--quality minimum mapping quality (default is 20).\n" -" -l,--length minimum read length in bp (default is 100).\n" -"Written by Michael Boemo, Department of Pathology, University of Cambridge.\n" -"Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues)."; - -struct Arguments { - std::string bamFilename; - std::string referenceFilename; - std::string outputFilename; - std::string indexFilename; - bool excludeCpG, methylAware, testAlignment; - double divergence; - int minQ; - int minL; - unsigned int threads; -}; - -Arguments parseDetectArguments( int argc, char** argv ){ - - if( argc < 2 ){ - - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent detect." 
<< std::endl << help << std::endl; - exit(EXIT_FAILURE); - } - - if ( std::string( argv[ 1 ] ) == "-h" or std::string( argv[ 1 ] ) == "--help" ){ - - std::cout << help << std::endl; - exit(EXIT_SUCCESS); - } - else if( argc < 4 ){ - - std::cout << "Exiting with error. Insufficient arguments passed to DNAscent detect." << std::endl; - exit(EXIT_FAILURE); - } - - Arguments args; - - /*defaults - we'll override these if the option was specified by the user */ - args.threads = 1; - args.minQ = 20; - args.minL = 100; - args.methylAware = false; - args.divergence = 0; - args.testAlignment = false; - - /*parse the command line arguments */#include "../Penthus/src/hmm.h" -#include "../Penthus/src/probability.h" -#include "../Penthus/src/states.h" - for ( int i = 1; i < argc; ){ - - std::string flag( argv[ i ] ); - - if ( flag == "-b" or flag == "--bam" ){ - - std::string strArg( argv[ i + 1 ] ); - args.bamFilename = strArg; - i+=2; - } - else if ( flag == "-r" or flag == "--reference" ){ - - std::string strArg( argv[ i + 1 ] ); - args.referenceFilename = strArg; - i+=2; - } - else if ( flag == "-t" or flag == "--threads" ){ - - std::string strArg( argv[ i + 1 ] ); - args.threads = std::stoi( strArg.c_str() ); - i+=2; - } - else if ( flag == "-q" or flag == "--quality" ){ - - std::string strArg( argv[ i + 1 ] ); - args.minQ = std::stoi( strArg.c_str() ); - i+=2; - } - else if ( flag == "-l" or flag == "--length" ){ - - std::string strArg( argv[ i + 1 ] ); - args.minL = std::stoi( strArg.c_str() ); - i+=2; - } - else if ( flag == "-i" or flag == "--index" ){ - - std::string strArg( argv[ i + 1 ] ); - args.indexFilename = strArg; - i+=2; - } - else if ( flag == "-o" or flag == "--output" ){ - - std::string strArg( argv[ i + 1 ] ); - args.outputFilename = strArg; - i+=2; - } - else if ( flag == "--divergence" ){ - - std::string strArg( argv[ i + 1 ] ); - args.divergence = std::stof(strArg.c_str()); - i+=2; - } - else if ( flag == "--methyl-aware" ){ - - args.methylAware = 
true; - i+=1; - } - else if ( flag == "--testAlignment" ){ - - args.testAlignment = true; - i+=1; - } - else throw InvalidOption( flag ); - } - return args; -} - -//Initial transitions within modules (internal transitions) -static double internalM12I = 0.3475; -static double internalI2I = 0.5; -static double internalM12M1 = 0.4; - -//Initial transitions between modules (external transitions) -static double externalD2D = 0.3; -static double externalD2M1 = 0.7; -static double externalI2M1 = 0.5; -static double externalM12D = 0.0025; -static double externalM12M1 = 0.25; - -double sequenceProbability( std::vector &observations, - std::string &sequence, - size_t windowSize, - bool useBrdU, - PoreParameters scalings, - size_t BrdUStart, - size_t BrdUEnd ){ - - std::vector< double > I_curr(2*windowSize+1, NAN), D_curr(2*windowSize+1, NAN), M_curr(2*windowSize+1, NAN), I_prev(2*windowSize+1, NAN), D_prev(2*windowSize+1, NAN), M_prev(2*windowSize+1, NAN); - double firstI_curr = NAN, firstI_prev = NAN; - double start_curr = NAN, start_prev = 0.0; - - double matchProb, insProb; - - /*-----------INITIALISATION----------- */ - //transitions from the start state - D_prev[0] = lnProd( start_prev, eln( 0.25 ) ); - - //account for transitions between deletion states before we emit the first observation - for ( unsigned int i = 1; i < D_prev.size(); i++ ){ - - D_prev[i] = lnProd( D_prev[i-1], eln ( externalD2D ) ); - } - - - /*-----------RECURSION----------- */ - /*complexity is O(T*N^2) where T is the number of observations and N is the number of states */ - double level_mu, level_sigma; - for ( unsigned int t = 0; t < observations.size(); t++ ){ - - std::fill( I_curr.begin(), I_curr.end(), NAN ); - std::fill( M_curr.begin(), M_curr.end(), NAN ); - std::fill( D_curr.begin(), D_curr.end(), NAN ); - firstI_curr = NAN; - - std::string sixMer = sequence.substr(0, 6); - - level_mu = scalings.shift + scalings.scale * thymidineModel.at(sixMer).first; - level_sigma = scalings.var * 
thymidineModel.at(sixMer).second; - - //uncomment to scale events - //level_mu = thymidineModel.at(sixMer).first; - //level_sigma = scalings.var / scalings.scale * thymidineModel.at(sixMer).second; - //observations[t] = (observations[t] - scalings.shift) / scalings.scale; - - matchProb = eln( normalPDF( level_mu, level_sigma, observations[t] ) ); - insProb = eln( uniformPDF( 0, 250, observations[t] ) ); - - //first insertion - firstI_curr = lnSum( firstI_curr, lnProd( lnProd( start_prev, eln( 0.25 ) ), insProb ) ); //start to first I - firstI_curr = lnSum( firstI_curr, lnProd( lnProd( firstI_prev, eln( 0.25 ) ), insProb ) ); //first I to first I - - //to the base 1 insertion - I_curr[0] = lnSum( I_curr[0], lnProd( lnProd( I_prev[0], eln( internalI2I ) ), insProb ) ); //I to I - I_curr[0] = lnSum( I_curr[0], lnProd( lnProd( M_prev[0], eln( internalM12I ) ), insProb ) ); //M to I - - //to the base 1 match - M_curr[0] = lnSum( M_curr[0], lnProd( lnProd( firstI_prev, eln( 0.5 ) ), matchProb ) ); //first I to first match - M_curr[0] = lnSum( M_curr[0], lnProd( lnProd( M_prev[0], eln( internalM12M1 ) ), matchProb ) ); //M to M - M_curr[0] = lnSum( M_curr[0], lnProd( lnProd( start_prev, eln( 0.5 ) ), matchProb ) ); //start to M - - //to the base 1 deletion - D_curr[0] = lnSum( D_curr[0], lnProd( NAN, eln( 0.25 ) ) ); //start to D - D_curr[0] = lnSum( D_curr[0], lnProd( firstI_curr, eln( 0.25 ) ) ); //first I to first deletion - - //the rest of the sequence - for ( unsigned int i = 1; i < I_curr.size(); i++ ){ - - //get model parameters - sixMer = sequence.substr(i, 6); - insProb = eln( uniformPDF( 0, 250, observations[t] ) ); - if ( useBrdU and BrdUStart - 5 <= i and i <= BrdUEnd and sixMer.find('T') != std::string::npos and analogueModel.count(sixMer) > 0 ){ - - level_mu = scalings.shift + scalings.scale * analogueModel.at(sixMer).first; - level_sigma = scalings.var * analogueModel.at(sixMer).second; - - //uncomment if you scale events - //level_mu = 
analogueModel.at(sixMer).first; - //level_sigma = scalings.var / scalings.scale * analogueModel.at(sixMer).second; - - matchProb = eln( normalPDF( level_mu, level_sigma, observations[t] ) ); - } - else{ - - level_mu = scalings.shift + scalings.scale * thymidineModel.at(sixMer).first; - level_sigma = scalings.var * thymidineModel.at(sixMer).second; - - //uncomment if you scale events - //level_mu = thymidineModel.at(sixMer).first; - //level_sigma = scalings.var / scalings.scale * thymidineModel.at(sixMer).second; - - matchProb = eln( normalPDF( level_mu, level_sigma, observations[t] ) ); - } - - //to the insertion - I_curr[i] = lnSum( I_curr[i], lnProd( lnProd( I_prev[i], eln( internalI2I ) ), insProb ) ); //I to I - I_curr[i] = lnSum( I_curr[i], lnProd( lnProd( M_prev[i], eln( internalM12I ) ), insProb ) ); //M to I - - //to the match - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( I_prev[i-1], eln( externalI2M1 ) ), matchProb ) ); //external I to M - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( M_prev[i-1], eln( externalM12M1 ) ), matchProb ) ); //external M to M - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( M_prev[i], eln( internalM12M1 ) ), matchProb ) ); //interal M to M - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( D_prev[i-1], eln( externalD2M1 ) ), matchProb ) ); //external D to M - } - - for ( unsigned int i = 1; i < I_curr.size(); i++ ){ - - //to the deletion - D_curr[i] = lnSum( D_curr[i], lnProd( M_curr[i-1], eln( externalM12D ) ) ); //external M to D - D_curr[i] = lnSum( D_curr[i], lnProd( D_curr[i-1], eln( externalD2D ) ) ); //external D to D - } - - I_prev = I_curr; - M_prev = M_curr; - D_prev = D_curr; - firstI_prev = firstI_curr; - start_prev = start_curr; - } - - - /*-----------TERMINATION----------- */ - double forwardProb = NAN; - - forwardProb = lnSum( forwardProb, lnProd( D_curr.back(), eln( 1.0 ) ) ); //D to end - forwardProb = lnSum( forwardProb, lnProd( M_curr.back(), eln( externalM12M1 + externalM12D ) ) ); //M to end - forwardProb = 
lnSum( forwardProb, lnProd( I_curr.back(), eln( externalI2M1 ) ) ); //I to end - -#if TEST_HMM -std::cerr << "<-------------------" << std::endl; -std::cerr << useBrdU << std::endl; -std::cerr << scalings.shift << " " << scalings.scale << " " << scalings.var << std::endl; -std::cerr << sequence << std::endl; -for (auto ob = observations.begin(); ob < observations.end(); ob++){ - std::cerr << *ob << " "; -} -std::cerr << std::endl; -std::cerr << forwardProb << std::endl; -#endif - - return forwardProb; -} - -double sequenceProbability_Penthus( std::vector &observations, - std::string &sequence, - size_t windowSize, - bool useBrdU, - PoreParameters scalings, - size_t BrdUStart, - size_t BrdUEnd ){ - - HiddenMarkovModel hmm = HiddenMarkovModel(); - - /*STATES - vector (of vectors) to hold the states at each position on the reference - fill with dummy values */ - std::vector< std::vector< State > > states( 3, std::vector< State >( sequence.length() - 5, State( NULL, "", "", "", 1.0 ) ) ); - - /*DISTRIBUTIONS - vector to hold normal distributions, a single uniform and silent distribution to use for everything else */ - std::vector< NormalDistribution > nd; - nd.reserve( sequence.length() - 5 ); - - SilentDistribution sd( 0.0, 0.0 ); - UniformDistribution ud( 0, 250.0 ); - - std::string loc, sixMer; - - /*create make normal distributions for each reference position using the ONT 6mer model */ - for ( unsigned int i = 0; i < sequence.length() - 5; i++ ){ - - sixMer = sequence.substr( i, 6 ); - - if ( useBrdU and BrdUStart - 5 <= i and i <= BrdUEnd and sixMer.find('T') != std::string::npos and analogueModel.count(sixMer) > 0 ){ - - nd.push_back( NormalDistribution( scalings.shift + scalings.scale * analogueModel.at(sixMer).first, scalings.var * analogueModel.at(sixMer).second ) ); - } - else { - - nd.push_back( NormalDistribution( scalings.shift + scalings.scale * thymidineModel.at(sixMer).first, scalings.var * thymidineModel.at(sixMer).second ) ); - } - } - - /*the first 
insertion state after start */ - State firstI = State( &ud, "-1_I", "", "", 1.0 ); - hmm.add_state( firstI ); - - /*add states to the model, handle internal module transitions */ - for ( unsigned int i = 0; i < sequence.length() - 5; i++ ){ - - loc = std::to_string( i ); - sixMer = sequence.substr( i, 6 ); - - states[ 0 ][ i ] = State( &sd, loc + "_D", sixMer, "", 1.0 ); - states[ 1 ][ i ] = State( &ud, loc + "_I", sixMer, "", 1.0 ); - states[ 2 ][ i ] = State( &nd[i], loc + "_M1", sixMer, loc + "_match", 1.0 ); - - /*add state to the model */ - for ( unsigned int j = 0; j < 3; j++ ){ - - states[ j ][ i ].meta = sixMer; - hmm.add_state( states[ j ][ i ] ); - } - - /*transitions between states, internal to a single base */ - /*from I */ - hmm.add_transition( states[1][i], states[1][i], internalI2I ); - - /*from M1 */ - hmm.add_transition( states[2][i], states[2][i], internalM12M1 ); - hmm.add_transition( states[2][i], states[1][i], internalM12I ); - } - - /*add transitions between modules (external transitions) */ - for ( unsigned int i = 0; i < sequence.length() - 6; i++ ){ - - /*from D */ - hmm.add_transition( states[0][i], states[0][i + 1], externalD2D ); - hmm.add_transition( states[0][i], states[2][i + 1], externalD2M1 ); - - /*from I */ - hmm.add_transition( states[1][i], states[2][i + 1], externalI2M1 ); - - /*from M */ - hmm.add_transition( states[2][i], states[0][i + 1], externalM12D ); - hmm.add_transition( states[2][i], states[2][i + 1], externalM12M1 ); - } - - /*handle start states */ - hmm.add_transition( hmm.start, firstI, 0.25 ); - hmm.add_transition( hmm.start, states[0][0], 0.25 ); - hmm.add_transition( hmm.start, states[2][0], 0.5 ); - - /*transitions from first insertion */ - hmm.add_transition( firstI, firstI, 0.25 ); - hmm.add_transition( firstI, states[0][0], 0.25 ); - hmm.add_transition( firstI, states[2][0], 0.5 ); - - /*handle end states */ - hmm.add_transition( states[0][sequence.length() - 6], hmm.end, 1.0 ); - hmm.add_transition( 
states[1][sequence.length() - 6], hmm.end, externalI2M1 ); - hmm.add_transition( states[2][sequence.length() - 6], hmm.end, externalM12M1 + externalM12D ); - - hmm.finalise(); - //std::pair > > dummy = hmm.forward( observations ); - return hmm.sequenceProbability( observations ); -} - -double sequenceProbability_methyl( std::vector &observations, - std::string &sequence, - std::string &sequence_methylated, - size_t windowSize, - PoreParameters scalings, - size_t MethylStart, - size_t MethylEnd ){ - - std::vector< double > I_curr(2*windowSize, NAN), D_curr(2*windowSize, NAN), M_curr(2*windowSize, NAN), I_prev(2*windowSize, NAN), D_prev(2*windowSize, NAN), M_prev(2*windowSize, NAN); - double firstI_curr = NAN, firstI_prev = NAN; - double start_curr = NAN, start_prev = 0.0; - - double matchProb, insProb; - - /*-----------INITIALISATION----------- */ - //transitions from the start state - D_prev[0] = lnProd( start_prev, eln( 0.25 ) ); - - //account for transitions between deletion states before we emit the first observation - for ( unsigned int i = 1; i < D_prev.size(); i++ ){ - - D_prev[i] = lnProd( D_prev[i-1], eln ( externalD2D ) ); - } - - - /*-----------RECURSION----------- */ - /*complexity is O(T*N^2) where T is the number of observations and N is the number of states */ - double level_mu, level_sigma; - for ( unsigned int t = 0; t < observations.size(); t++ ){ - - std::fill( I_curr.begin(), I_curr.end(), NAN ); - std::fill( M_curr.begin(), M_curr.end(), NAN ); - std::fill( D_curr.begin(), D_curr.end(), NAN ); - firstI_curr = NAN; - - std::string sixMer = sequence.substr(0, 6); - std::string sixMer_methyl = sequence_methylated.substr(0,6); - - if ( sixMer_methyl.find('M') != std::string::npos and methyl5mCModel.count(sixMer_methyl) > 0 ){ - - level_mu = scalings.shift + scalings.scale * methyl5mCModel.at(sixMer_methyl).first; - level_sigma = scalings.var * methyl5mCModel.at(sixMer_methyl).second; - } - else { - - level_mu = scalings.shift + scalings.scale * 
thymidineModel.at(sixMer).first; - level_sigma = scalings.var * thymidineModel.at(sixMer).second; - } - - //uncomment to scale events - //level_mu = thymidineModel.at(sixMer).first; - //level_sigma = scalings.var / scalings.scale * thymidineModel.at(sixMer).second; - //observations[t] = (observations[t] - scalings.shift) / scalings.scale; - - matchProb = eln( normalPDF( level_mu, level_sigma, observations[t] ) ); - insProb = eln( uniformPDF( 0, 250, observations[t] ) ); - - //first insertion - firstI_curr = lnSum( firstI_curr, lnProd( lnProd( start_prev, eln( 0.25 ) ), insProb ) ); //start to first I - firstI_curr = lnSum( firstI_curr, lnProd( lnProd( firstI_prev, eln( 0.25 ) ), insProb ) ); //first I to first I - - //to the base 1 insertion - I_curr[0] = lnSum( I_curr[0], lnProd( lnProd( I_prev[0], eln( internalI2I ) ), insProb ) ); //I to I - I_curr[0] = lnSum( I_curr[0], lnProd( lnProd( M_prev[0], eln( internalM12I ) ), insProb ) ); //M to I - - //to the base 1 match - M_curr[0] = lnSum( M_curr[0], lnProd( lnProd( firstI_prev, eln( 0.5 ) ), matchProb ) ); //first I to first match - M_curr[0] = lnSum( M_curr[0], lnProd( lnProd( M_prev[0], eln( internalM12M1 ) ), matchProb ) ); //M to M - M_curr[0] = lnSum( M_curr[0], lnProd( lnProd( start_prev, eln( 0.5 ) ), matchProb ) ); //start to M - - //to the base 1 deletion - D_curr[0] = lnSum( D_curr[0], lnProd( NAN, eln( 0.25 ) ) ); //start to D - D_curr[0] = lnSum( D_curr[0], lnProd( firstI_curr, eln( 0.25 ) ) ); //first I to first deletion - - //the rest of the sequence - for ( unsigned int i = 1; i < I_curr.size(); i++ ){ - - //get model parameters - sixMer = sequence.substr(i, 6); - sixMer_methyl = sequence_methylated.substr(i,6); - insProb = eln( uniformPDF( 0, 250, observations[t] ) ); - if ( methyl5mCModel.count(sixMer_methyl) > 0 and MethylStart - 5 <= i and i <= MethylEnd ){ - - level_mu = scalings.shift + scalings.scale * methyl5mCModel.at(sixMer_methyl).first; - level_sigma = scalings.var * 
methyl5mCModel.at(sixMer_methyl).second; - - //uncomment if you scale events - //level_mu = analogueModel.at(sixMer).first; - //level_sigma = scalings.var / scalings.scale * analogueModel.at(sixMer).second; - - matchProb = eln( normalPDF( level_mu, level_sigma, observations[t] ) ); - } - else{ - level_mu = scalings.shift + scalings.scale * thymidineModel.at(sixMer).first; - level_sigma = scalings.var * thymidineModel.at(sixMer).second; - - //uncomment if you scale events - //level_mu = thymidineModel.at(sixMer).first; - //level_sigma = scalings.var / scalings.scale * thymidineModel.at(sixMer).second; - - matchProb = eln( normalPDF( level_mu, level_sigma, observations[t] ) ); - } - - //to the insertion - I_curr[i] = lnSum( I_curr[i], lnProd( lnProd( I_prev[i], eln( internalI2I ) ), insProb ) ); //I to I - I_curr[i] = lnSum( I_curr[i], lnProd( lnProd( M_prev[i], eln( internalM12I ) ), insProb ) ); //M to I - - //to the match - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( I_prev[i-1], eln( externalI2M1 ) ), matchProb ) ); //external I to M - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( M_prev[i-1], eln( externalM12M1 ) ), matchProb ) ); //external M to M - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( M_prev[i], eln( internalM12M1 ) ), matchProb ) ); //interal M to M - M_curr[i] = lnSum( M_curr[i], lnProd( lnProd( D_prev[i-1], eln( externalD2M1 ) ), matchProb ) ); //external D to M - } - - for ( unsigned int i = 1; i < I_curr.size(); i++ ){ - - //to the deletion - D_curr[i] = lnSum( D_curr[i], lnProd( M_curr[i-1], eln( externalM12D ) ) ); //external M to D - D_curr[i] = lnSum( D_curr[i], lnProd( D_curr[i-1], eln( externalD2D ) ) ); //external D to D - } - - I_prev = I_curr; - M_prev = M_curr; - D_prev = D_curr; - firstI_prev = firstI_curr; - start_prev = start_curr; - } - - - /*-----------TERMINATION----------- */ - double forwardProb = NAN; - - forwardProb = lnSum( forwardProb, lnProd( D_curr.back(), eln( 1.0 ) ) ); //D to end - forwardProb = lnSum( forwardProb, 
lnProd( M_curr.back(), eln( externalM12M1 + externalM12D ) ) ); //M to end - forwardProb = lnSum( forwardProb, lnProd( I_curr.back(), eln( externalI2M1 ) ) ); //I to end - - return forwardProb; -} - - -std::string getQuerySequence( bam1_t *record ){ - //Covered in: tests/detect/htslib - - std::string seq; - uint8_t *a_seq = bam_get_seq(record); - for ( int i = 0; i < record -> core.l_qseq; i++){ - int seqInBase = bam_seqi(a_seq,i); - - switch (seqInBase) { - - case 1: seq += "A"; break; - case 2: seq += "C"; break; - case 4: seq += "G"; break; - case 8: seq += "T"; break; - case 15: seq += "N"; break; - default: throw ParsingError(); - } - } - return seq; -} - - -void getRefEnd(bam1_t *record, int &refStart, int &refEnd ){ - //Covered in: tests/detect/htslib - - //initialise reference coordinates for the first match - refStart = record -> core.pos; - int refPosition = 0; - - const uint32_t *cigar = bam_get_cigar(record); - - if ( bam_is_rev(record) ){ - - for ( int i = record -> core.n_cigar - 1; i >= 0; i--){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - refPosition += ol; - } - //for a deletion - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - refPosition += ol; - } - //for insertions, advance only the query position so skip - //N.B. 
hard clipping advances neither refernce nor query, so ignore it - } - } - else { - - for ( unsigned int i = 0; i < record -> core.n_cigar; ++i){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match, advance both reference and query together - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - refPosition += ol; - } - //for a deletion, advance only the reference position - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - refPosition += ol; - } - //for insertions, advance only the query position so skip - //N.B. hard clipping advances neither refernce nor query, so ignore it - } - } - refEnd = refStart + refPosition; -} - - -void parseCigar(bam1_t *record, std::map< unsigned int, unsigned int > &ref2query, int &refStart, int &refEnd ){ - //Covered in: tests/detect/htslib - - //initialise reference and query coordinates for the first match - refStart = record -> core.pos; - int queryPosition = 0; - int refPosition = 0; - - const uint32_t *cigar = bam_get_cigar(record); - - if ( bam_is_rev(record) ){ - - for ( int i = record -> core.n_cigar - 1; i >= 0; i--){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match, advance both reference and query together - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - refPosition += ol; - } - //for a deletion, advance only the reference position - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - } - refPosition += ol; - } - //for insertions or soft clipping, advance only the query position - else if (op == BAM_CSOFT_CLIP or op == BAM_CINS){ - - for ( int j = refPosition; j < 
refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - } - //N.B. hard clipping advances neither refernce nor query, so ignore it - } - } - else { - - for ( unsigned int i = 0; i < record -> core.n_cigar; ++i){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match, advance both reference and query together - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - refPosition += ol; - } - //for a deletion, advance only the reference position - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - } - refPosition += ol; - } - //for insertions or soft clipping, advance only the query position - else if (op == BAM_CSOFT_CLIP or op == BAM_CINS){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - } - //N.B. hard clipping advances neither refernce nor query, so ignore it - } - } - refEnd = refStart + refPosition; -} - - -void parseIndex( std::string indexFilename, std::map< std::string, std::string > &readID2path, bool &bulk ){ - - std::cout << "Loading DNAscent index... "; - std::ifstream indexFile( indexFilename ); - if ( not indexFile.is_open() ) throw IOerror( indexFilename ); - std::string line; - - //get whether this is bulk fast5 or individual fast5 from the index - std::getline( indexFile, line); - if (line == "#bulk") bulk = true; - else if (line == "#individual") bulk = false; - else throw IndexFormatting(); - - //get the readID to path map - while ( std::getline( indexFile, line) ){ - - std::string readID = line.substr(0, line.find('\t')); - std::string path = line.substr(line.find('\t')+1); - readID2path[readID] = path; - } - std::cout << "ok." 
<< std::endl; -} - -void countRecords( htsFile *bam_fh, hts_idx_t *bam_idx, bam_hdr_t *bam_hdr, int &numOfRecords, int minQ, int minL ){ - - std::cout << "Scanning bam file..."; - hts_itr_t* itr = sam_itr_querys(bam_idx,bam_hdr,"."); - int result; - - do { - bam1_t *record = bam_init1(); - result = sam_itr_next(bam_fh, itr, record); - int refStart,refEnd; - getRefEnd(record,refStart,refEnd); - if ( (record -> core.qual >= minQ) and (refEnd - refStart >= minL) ) numOfRecords++; - bam_destroy1(record); - } while (result > 0); - - //cleanup - sam_itr_destroy(itr); - std::cout << "ok." << std::endl; -} - - -std::vector< unsigned int > getPOIs( std::string &refSeq, int windowLength ){ - - std::vector< unsigned int > POIs; - - for ( unsigned int i = 2*windowLength; i < refSeq.length() - 2*windowLength; i++ ){ - - if (refSeq.substr(i,1) == "T") POIs.push_back(i); - } - return POIs; -} - - -std::string methylateSequence( std::string &inSeq ){ - - std::string outSeq = inSeq; - - for ( unsigned int i = 0; i < inSeq.size(); i++ ){ - - //CpG - if ( inSeq.substr(i,2) == "CG" ) outSeq.replace(i,1,"M"); - - //GpC - //if ( inSeq.substr(i,2) == "GC" ) outSeq.replace(i+1,1,"M"); - - //Dam methylation (methyl-adenine in GATC) - //if ( inSeq.substr(i,4) == "GATC" ) outSeq.replace(i+1,1,"M"); - - //Dcm methylation (methyl-cytosine second cytonsine of CCAGG and CCTGG) - //if ( inSeq.substr(i,5) == "CCAGG" ) outSeq.replace(i+1,1,"M"); - //if ( inSeq.substr(i,5) == "CCTGG" ) outSeq.replace(i+1,1,"M"); - } - return outSeq; -} - - -std::string llAcrossRead( read &r, - unsigned int windowLength, - int &failedEvents, - bool methylAware ){ - - std::string out; - //get the positions on the reference subsequence where we could attempt to make a call - std::vector< unsigned int > POIs = getPOIs( r.referenceSeqMappedTo, windowLength ); - std::string strand; - unsigned int readHead = 0; - if ( r.isReverse ){ - - strand = "rev"; - readHead = (r.eventAlignment).size() - 1; - std::reverse( 
POIs.begin(), POIs.end() ); - } - else{ - - strand = "fwd"; - readHead = 0; - } - - out += ">" + r.readID + " " + r.referenceMappedTo + " " + std::to_string(r.refStart) + " " + std::to_string(r.refEnd) + " " + strand + "\n"; - - for ( unsigned int i = 0; i < POIs.size(); i++ ){ - - int posOnRef = POIs[i]; - int posOnQuery = (r.refToQuery).at(posOnRef); - - std::string readSnippet = (r.referenceSeqMappedTo).substr(posOnRef - windowLength, 2*windowLength+6); - - //make sure the read snippet is fully defined as A/T/G/C in reference - unsigned int As = 0, Ts = 0, Cs = 0, Gs = 0; - for ( std::string::iterator i = readSnippet.begin(); i < readSnippet.end(); i++ ){ - - switch( *i ){ - case 'A' : - As++; - break; - case 'T' : - Ts++; - break; - case 'G' : - Gs++; - break; - case 'C' : - Cs++; - break; - } - } - if ( readSnippet.length() != (As + Ts + Gs + Cs) ) continue; - - std::vector< double > eventSnippet; - - //catch spans with lots of insertions or deletions (this QC was set using results of tests/detect/hmm_falsePositives) - int spanOnQuery = (r.refToQuery)[posOnRef + windowLength+6] - (r.refToQuery)[posOnRef - windowLength]; - if ( spanOnQuery > 3.5*windowLength or spanOnQuery < 2*windowLength ) continue; - - /*get the events that correspond to the read snippet */ - bool first = true; - if ( r.isReverse ){ - - for ( unsigned int j = readHead; j >= 0; j-- ){ - - /*if an event has been aligned to a position in the window, add it */ - if ( (r.eventAlignment)[j].second >= (r.refToQuery)[posOnRef - windowLength] and (r.eventAlignment)[j].second < (r.refToQuery)[posOnRef + windowLength] ){ - - if (first){ - readHead = j; - first = false; - //std::cout << "READHEAD:" << j << " " << readHead << std::endl; - } - - double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; - if (ev > 0 and ev < 250){ - eventSnippet.push_back( ev ); - } - else{ - - failedEvents++; - } - } - - /*stop once we get to the end of the window */ - if ( (r.eventAlignment)[j].second < 
(r.refToQuery)[posOnRef - windowLength] ){ - - std::reverse(eventSnippet.begin(), eventSnippet.end()); - break; - } - } - } - else{ - for ( unsigned int j = readHead; j < (r.eventAlignment).size(); j++ ){ - - /*if an event has been aligned to a position in the window, add it */ - if ( (r.eventAlignment)[j].second >= (r.refToQuery)[posOnRef - windowLength] and (r.eventAlignment)[j].second < (r.refToQuery)[posOnRef + windowLength] ){ - - if (first){ - readHead = j; - first = false; - //std::cout << "READHEAD:" << j << " " << readHead << std::endl; - } - - double ev = (r.normalisedEvents)[(r.eventAlignment)[j].first]; - if (ev > 0 and ev < 250){ - eventSnippet.push_back( ev ); - } - else{ - - failedEvents++; - } - } - - /*stop once we get to the end of the window */ - if ( (r.eventAlignment)[j].second >= (r.refToQuery)[posOnRef + windowLength] ) break; - } - } - - //catch abnormally few or many events (this QC was set using results of tests/detect/hmm_falsePositives) - if ( eventSnippet.size() > 8*windowLength or eventSnippet.size() < 3.5*windowLength ) continue; - - /* - TESTING - print out the read snippet, the ONT model, and the aligned events - std::cout << readSnippet << std::endl; - for ( int pos = 0; pos < readSnippet.length()-5; pos++ ){ - - std::cout << readSnippet.substr(pos,6) << "\t" << thymidineModel.at( readSnippet.substr(pos,6) ).first << std::endl; - } - for ( auto ev = eventSnippet.begin(); ev < eventSnippet.end(); ev++){ - double scaledEv = (*ev - r.scalings.shift) / r.scalings.scale; - std::cout << scaledEv << std::endl; - } - */ - - //calculate where we are on the assembly - if we're a reverse complement, we're moving backwards down the reference genome - int globalPosOnRef; - std::string sixMerQuery = (r.basecall).substr(posOnQuery, 6); - std::string sixMerRef = (r.referenceSeqMappedTo).substr(posOnRef, 6); - if ( r.isReverse ){ - - globalPosOnRef = r.refEnd - posOnRef - 6; - sixMerQuery = reverseComplement( sixMerQuery ); - sixMerRef = 
reverseComplement( sixMerRef ); - } - else{ - - globalPosOnRef = r.refStart + posOnRef; - } - - //make the BrdU call - std::string sixOI = (r.referenceSeqMappedTo).substr(posOnRef,6); - size_t BrdUStart = sixOI.find('T') + windowLength; - size_t BrdUEnd = sixOI.rfind('T') + windowLength; - double logProbAnalogue = sequenceProbability( eventSnippet, readSnippet, windowLength, true, r.scalings, BrdUStart, BrdUEnd ); - double logProbThymidine = sequenceProbability( eventSnippet, readSnippet, windowLength, false, r.scalings, 0, 0 ); - double logLikelihoodRatio = logProbAnalogue - logProbThymidine; - //test hard HMM implementation against Penthus implementation - //double logProbThymidine_Penthus = sequenceProbability_Penthus( eventSnippet, readSnippet, windowLength, false, r.scalings, BrdUStart, BrdUEnd ); - //std::cerr << ">>>>" << logProbThymidine_Penthus << std::endl; - -#if TEST_LL -std::cerr << "<-------------------" << std::endl; -std::cerr << spanOnQuery << std::endl; -std::cerr << readSnippet << std::endl; -for (auto ob = eventSnippet.begin(); ob < eventSnippet.end(); ob++){ - std::cerr << *ob << " "; -} -std::cerr << std::endl; -std::cerr << logLikelihoodRatio << std::endl; -#endif - - if ( methylAware) { - - std::string readSnippetMethylated = methylateSequence( readSnippet ); - std::string conflictSubseq = readSnippetMethylated.substr(BrdUStart-5,BrdUEnd+11-BrdUStart); - - if (conflictSubseq.find("M") == std::string::npos){ - - out += std::to_string(globalPosOnRef) + "\t" + std::to_string(logLikelihoodRatio) + "\t" + sixMerRef + "\t" + sixMerQuery + "\n"; - } - else{ - - size_t MethylStart = conflictSubseq.find('M') + BrdUStart-5; - size_t MethylEnd = conflictSubseq.rfind('M') + BrdUStart-5; - - double logProbMethylated = sequenceProbability_methyl( eventSnippet, readSnippet, readSnippetMethylated, windowLength, r.scalings, MethylStart, MethylEnd ); - double logLikelihood_BrdUvsMethyl = logProbAnalogue - logProbMethylated; - double logLikelihood_MethylvsThym 
= logProbMethylated - logProbThymidine; - out += std::to_string(globalPosOnRef) + "\t" + std::to_string(logLikelihoodRatio) + "\t" + std::to_string(logLikelihood_BrdUvsMethyl) + "\t" + std::to_string(logLikelihood_MethylvsThym) + "\t" + sixMerRef + "\t" + sixMerQuery + "\n"; - } - } - else{ - - out += std::to_string(globalPosOnRef) + "\t" + std::to_string(logLikelihoodRatio) + "\t" + sixMerRef + "\t" + sixMerQuery + "\n"; - } - } - return out; -} - - -int detect_main( int argc, char** argv ){ - - Arguments args = parseDetectArguments( argc, argv ); - bool bulkFast5; - - //load DNAscent index - std::map< std::string, std::string > readID2path; - parseIndex( args.indexFilename, readID2path, bulkFast5 ); - - //import fasta reference - std::map< std::string, std::string > reference = import_reference_pfasta( args.referenceFilename ); - - std::ofstream outFile( args.outputFilename ); - if ( not outFile.is_open() ) throw IOerror( args.outputFilename ); - - htsFile* bam_fh; - hts_idx_t* bam_idx; - bam_hdr_t* bam_hdr; - hts_itr_t* itr; - - //load the bam - std::cout << "Opening bam file... "; - bam_fh = sam_open((args.bamFilename).c_str(), "r"); - if (bam_fh == NULL) throw IOerror(args.bamFilename); - - //load the index - bam_idx = sam_index_load(bam_fh, (args.bamFilename).c_str()); - if (bam_idx == NULL) throw IOerror("index for "+args.bamFilename); - - //load the header - bam_hdr = sam_hdr_read(bam_fh); - std::cout << "ok." 
<< std::endl; - - /*initialise progress */ - int numOfRecords = 0, prog = 0, failed = 0; - countRecords( bam_fh, bam_idx, bam_hdr, numOfRecords, args.minQ, args.minL ); - progressBar pb(numOfRecords,true); - - //build an iterator for all reads in the bam file - const char *allReads = "."; - itr = sam_itr_querys(bam_idx,bam_hdr,allReads); - - unsigned int windowLength = 10; - int result; - int failedEvents = 0; - unsigned int maxBufferSize; - std::vector< bam1_t * > buffer; - if ( args.threads <= 4 ) maxBufferSize = args.threads; - else maxBufferSize = 4*(args.threads); - - do { - //initialise the record and get the record from the file iterator - bam1_t *record = bam_init1(); - result = sam_itr_next(bam_fh, itr, record); - - //add the record to the buffer if it passes the user's criteria, otherwise destroy it cleanly - int mappingQual = record -> core.qual; - int refStart,refEnd; - getRefEnd(record,refStart,refEnd); - if ( mappingQual >= args.minQ and refEnd - refStart >= args.minL ){ - buffer.push_back( record ); - } - else{ - bam_destroy1(record); - } - - /*if we've filled up the buffer with short reads, compute them in parallel */ - if (buffer.size() >= maxBufferSize or (buffer.size() > 0 and result == -1 ) ){ - - #pragma omp parallel for schedule(dynamic) shared(buffer,windowLength,analogueModel,thymidineModel,methyl5mCModel,args,prog,failed) num_threads(args.threads) - for (unsigned int i = 0; i < buffer.size(); i++){ - - read r; - - //get the read name (which will be the ONT readID from Albacore basecall) - const char *queryName = bam_get_qname(buffer[i]); - if (queryName == NULL) continue; - std::string s_queryName(queryName); - r.readID = s_queryName; - - //iterate on the cigar string to fill up the reference-to-query coordinate map - parseCigar(buffer[i], r.refToQuery, r.refStart, r.refEnd); - - //get the name of the reference mapped to - std::string mappedTo(bam_hdr -> target_name[buffer[i] -> core.tid]); - r.referenceMappedTo = mappedTo; - - //open fast5 
and normalise events to pA - r.filename = readID2path[s_queryName]; - - try{ - - if (bulkFast5) bulk_getEvents(r.filename, r.readID, r.raw); - else getEvents( r.filename, r.raw ); - } - catch ( BadFast5Field &bf5 ){ - - failed++; - prog++; - continue; - } - /*get the subsequence of the reference this read mapped to */ - r.referenceSeqMappedTo = reference.at(r.referenceMappedTo).substr(r.refStart, r.refEnd - r.refStart); - - //fetch the basecall from the bam file - r.basecall = getQuerySequence(buffer[i]); - - //account for reverse complements - if ( bam_is_rev(buffer[i]) ){ - - r.basecall = reverseComplement( r.basecall ); - r.referenceSeqMappedTo = reverseComplement( r.referenceSeqMappedTo ); - r.isReverse = true; - } - - normaliseEvents(r); - - //catch reads with rough event alignments that fail the QC - if ( r.eventAlignment.size() == 0 ){ - - failed++; - prog++; - continue; - } - - std::string readOut = llAcrossRead(r, windowLength, failedEvents, args.methylAware); - - #pragma omp critical - { - outFile << readOut; - prog++; - pb.displayProgress( prog, failed, failedEvents ); - - if (args.testAlignment){ - std::cerr << ">" << r.readID << std::endl; - for ( auto p_align = r.eventAlignment.begin(); p_align < r.eventAlignment.end(); p_align++ ){ - - std::cerr<< p_align -> first << " " << p_align -> second << std::endl; - } - r.alignmentQCs.printQCs(); - r.printScalings(); - } - } - } - for ( unsigned int i = 0; i < buffer.size(); i++ ) bam_destroy1(buffer[i]); - buffer.clear(); - } - pb.displayProgress( prog, failed, failedEvents ); - } while (result > 0); - sam_itr_destroy(itr); - std::cout << std::endl; - return 0; -} diff --git a/tests/detect/hmm_forward/testHMMForward.py b/tests/detect/hmm_forward/testHMMForward.py deleted file mode 100644 index 61b89fa..0000000 --- a/tests/detect/hmm_forward/testHMMForward.py +++ /dev/null @@ -1,225 +0,0 @@ -from pomegranate import * - -#Usage: python testHMMForward.py DNAscentDetect.stderr -#where DNAscentDetect.stderr is 
from stderr after running DNAscent detect with #define TEST_HMM 1 - -def import_poreModel(filename): -# takes the filename of an ONT pore model file and returns a map from kmer (string) to [mean,std] (list of floats) -# ARGUMENTS -# --------- -# - filename: path to an ONT model file -# type: string -# OUTPUTS -# ------- -# - kmer2MeanStd: a map, keyed by a kmer, that returns the model mean and standard deviation signal for that kmer -# type: dictionary - - f = open(filename,'r') - g = f.readlines() - f.close() - - kmer2MeanStd = {} - for line in g: - if line[0] != '#' and line[0:4] != 'kmer': #ignore the header - splitLine = line.split('\t') - kmer2MeanStd[ splitLine[0] ] = [ float(splitLine[1]), float(splitLine[2]) ] - g = None - - return kmer2MeanStd - - -def build_TrainingHMM(refSequence, thymidineModel, brduModel, scalings, useBrdU): - - hmm = HiddenMarkovModel() - - windowSize = 10 - brduStart = windowSize - 5 - brduEnd = windowSize# + refSequence[windowSize:windowSize+6].rfind('T') - - refLength = len(refSequence) - - #new HMM transition parameters - internalM12I = 0.3475 - internalI2I = 0.5 - internalM12M1 = 0.4 - - externalD2D = 0.3 - externalD2M1 = 0.7 - externalI2M1 = 0.5 - externalM12D = 0.0025 - externalM12M1 = 0.25 - - ########################################################################################################################### - # Add States to Model - #Create the HMM states. Iterate through the reference sequence, and make the repeating HMM module for each position in the sequence. 
- - #two-dimensional array for the states, where the columns are the positions in the reference - states = [[0 for x in range(refLength)] for y in range(3)] - - #the first base - i = 0 - emissions = thymidineModel[refSequence[0:6]] - level_mu = scalings[0] + scalings[1] * emissions[0] - level_sig = scalings[2] * emissions[1] - - #print(refSequence[i:i+6]) - #print(level_mu,level_sig) - - states[0][i] = State( UniformDistribution(0, 250, frozen=True), name='Insertion_'+str(i) ) - states[1][i] = State( NormalDistribution(level_mu, level_sig), name='Match_'+str(i) ) - states[2][i] = State( None, name='Deletion_'+str(i) ) - for j in range(3): - hmm.add_state(states[j][i]) - - #make an insertion state before the first base - firstI = State( UniformDistribution(0, 250, frozen=True), name='Insertion_Pre' ) - hmm.add_state(firstI) - - #insertion state before the first base - hmm.add_transition(hmm.start, firstI, 0.25) #start to the first insertion - hmm.add_transition(firstI, firstI, 0.25) #self loop - - #to the base 1 insertion - hmm.add_transition(states[0][i], states[0][i], internalI2I , group='internal_I-to-I') - hmm.add_transition(states[1][i], states[0][i], internalM12I , group='internal_M-to-I') - - #to the base 1 match - hmm.add_transition(firstI, states[1][i], 0.5) #first insertion to first match - hmm.add_transition(states[1][i], states[1][i], internalM12M1 , group='internal_M-to-M') - hmm.add_transition(hmm.start, states[1][0], 0.5) #start to M - - #to the base 1 deletion - hmm.add_transition(firstI, states[2][i], 0.25) #first insertion to the first deletion - hmm.add_transition(hmm.start, states[2][0], 0.25) #start to D - - #the rest of the sequence - for i, char in enumerate(refSequence[1:-5]): - - i += 1 - - if (useBrdU and brduStart <= i and i <= brduEnd): - emissions = brduModel[refSequence[i:i+6]] - else: - emissions = thymidineModel[refSequence[i:i+6]] - - #correct for shift/scale/var - level_mu = scalings[0] + scalings[1] * emissions[0] - level_sig = 
scalings[2] * emissions[1] - - #print(refSequence[i:i+6]) - #print(level_mu,level_sig) - - #create states for this nucleotide - states[0][i] = State( UniformDistribution(0, 250, frozen=True), name='Insertion_'+str(i) ) - states[1][i] = State( NormalDistribution(level_mu, level_sig), name='Match_'+str(i) ) - states[2][i] = State( None, name='Deletion_'+str(i) ) - - #add the state objects to the hmm model object - for j in range(3): - hmm.add_state(states[j][i]) - - #internal transitions for this nucleotide - hmm.add_transition(states[1][i], states[1][i], internalM12M1 , group='internal_M-to-M') - hmm.add_transition(states[1][i], states[0][i], internalM12I , group='internal_M-to-I') - hmm.add_transition(states[0][i], states[0][i], internalI2I , group='internal_I-to-I') - - #this is really just a safety thing... get the last index in the iterator - for last_i,x in enumerate(refSequence[:-5]): - pass - - ########################################################################################################################### - # Handle Transitions Between Modules, Handle Analogue Branch - - #We have to reach forward to the next state (state i+1) so it's easier to just do this in a separate loop from the internal transitions one - for i, char in enumerate(refSequence[:-5]): - - #Don't execute this if we're at the end, because there's no i+1 to reach forward to. 
- if i != last_i: - - hmm.add_transition(states[1][i], states[2][i+1], externalM12D, group='external_M-to-D') - hmm.add_transition(states[2][i], states[2][i+1], externalD2D, group='external_D-to-D') - hmm.add_transition(states[2][i], states[1][i+1], externalD2M1, group='external_D-to-M') - hmm.add_transition(states[1][i], states[1][i+1], externalM12M1, group='external_M-to-M') - hmm.add_transition(states[0][i], states[1][i+1], externalI2M1, group='external_I-to-M') - - ########################################################################################################################### - # Handle Start and End - - #handle end states - hmm.add_transition(states[0][last_i], hmm.end, externalI2M1 ) - hmm.add_transition(states[1][last_i], hmm.end, externalM12M1 + externalM12D) - hmm.add_transition(states[2][last_i], hmm.end, 1.0) - - #bake the model - #hmm.bake(merge='all',verbose=True) - hmm.bake() - return hmm - -########################################################################################################################### -# MAIN - -print('Loading pore models...') -f_thymidineModel = '/home/mb915/rds/hpc-work/development/DNAscent_dev/pore_models/template_median68pA.6mer.model' -thymidineModel = import_poreModel(f_thymidineModel) -f_brduModel = '/home/mb915/rds/hpc-work/development/DNAscent_dev/pore_models/BrdU.model' -brduModel = import_poreModel(f_brduModel) -print('Done.') - -#scalings: shift,scale,var - -f = open(sys.argv[1],'r') -idx = 0 -for line in f: - idx += 1 - if line[0] == '<': - idx = 0 - if idx == 1: - useBrdU = int(line.rstrip()) - elif idx == 2: - scalings = line.rstrip().split() - scalings = [float(i) for i in scalings] - elif idx == 3: - sequence = line.rstrip() - elif idx == 4: - events = line.rstrip().split() - events = [float(i) for i in events] - elif idx == 5: - hmm = build_TrainingHMM(sequence, thymidineModel, brduModel, scalings, useBrdU) - vitOut = hmm.viterbi(events) - viterbiStates = vitOut[1] - names = [] - insertions = 0 
- deletions = 0 - for s in viterbiStates: - names.append(s[1].name) - if 'Insertion' in s[1].name: - insertions += 1 - elif 'Deletion' in s[1].name: - deletions += 1 - if useBrdU: - DNAscentProb_BrdU = float(line.rstrip()) - pomegranateProb_BrdU = hmm.log_probability(events) - namesBrdU = names - insertions_BrdU = insertions - deletions_BrdU = deletions - print(useBrdU, pomegranateProb_BrdU, DNAscentProb_BrdU) - else: - DNAscentProb_thym = float(line.rstrip()) - pomegranateProb_thym = hmm.log_probability(events) - namesThym = names - insertions_Thym = insertions - deletions_Thym = deletions - print(useBrdU, pomegranateProb_thym, DNAscentProb_thym) - ''' - print('------------------------------------------') - print('Log Likelihood Ratio: ',DNAscentProb_BrdU-DNAscentProb_thym) - print('Thymidine Insertions:',insertions_Thym) - print('Thymidine Deletions:',deletions_Thym) - print('BrdU Insertions:',insertions_BrdU) - print('BrdU Deletions:',deletions_BrdU) - print('Thym Path:',namesThym) - print('BrdU Path:',namesBrdU) - print('------------------------------------------') - ''' -f.close() - diff --git a/tests/detect/hmm_forward/testHMMProbeViterbi.py b/tests/detect/hmm_forward/testHMMProbeViterbi.py deleted file mode 100644 index eccea80..0000000 --- a/tests/detect/hmm_forward/testHMMProbeViterbi.py +++ /dev/null @@ -1,286 +0,0 @@ -from pomegranate import * -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt - -#Usage: python testHMMForward.py DNAscentDetect.stderr -#where DNAscentDetect.stderr is from stderr after running DNAscent detect with #define TEST_HMM 1 - -def import_poreModel(filename): -# takes the filename of an ONT pore model file and returns a map from kmer (string) to [mean,std] (list of floats) -# ARGUMENTS -# --------- -# - filename: path to an ONT model file -# type: string -# OUTPUTS -# ------- -# - kmer2MeanStd: a map, keyed by a kmer, that returns the model mean and standard deviation signal for that kmer -# type: 
dictionary - - f = open(filename,'r') - g = f.readlines() - f.close() - - kmer2MeanStd = {} - for line in g: - if line[0] != '#' and line[0:4] != 'kmer': #ignore the header - splitLine = line.split('\t') - kmer2MeanStd[ splitLine[0] ] = [ float(splitLine[1]), float(splitLine[2]) ] - g = None - - return kmer2MeanStd - - -def build_TrainingHMM(refSequence, thymidineModel, brduModel, scalings, useBrdU): - - hmm = HiddenMarkovModel() - - windowSize = 10 - brduStart = windowSize - 5 - brduEnd = windowSize + refSequence[windowSize:windowSize+6].rfind('T') - - refLength = len(refSequence) - - #new HMM transition parameters - internalM12I = 0.3475 - internalI2I = 0.5 - internalM12M1 = 0.4 - - externalD2D = 0.3 - externalD2M1 = 0.7 - externalI2M1 = 0.5 - externalM12D = 0.0025 - externalM12M1 = 0.25 - - ########################################################################################################################### - # Add States to Model - #Create the HMM states. Iterate through the reference sequence, and make the repeating HMM module for each position in the sequence. 
- - #two-dimensional array for the states, where the columns are the positions in the reference - states = [[0 for x in range(refLength)] for y in range(3)] - - #the first base - i = 0 - emissions = thymidineModel[refSequence[0:6]] - level_mu = scalings[0] + scalings[1] * emissions[0] - level_sig = scalings[2] * emissions[1] - - #print(refSequence[i:i+6]) - #print(level_mu,level_sig) - - states[0][i] = State( UniformDistribution(0, 250, frozen=True), name='Insertion_'+str(i) ) - states[1][i] = State( NormalDistribution(level_mu, level_sig), name='Match_'+str(i) ) - states[2][i] = State( None, name='Deletion_'+str(i) ) - for j in range(3): - hmm.add_state(states[j][i]) - - #make an insertion state before the first base - firstI = State( UniformDistribution(0, 250, frozen=True), name='Insertion_Pre' ) - hmm.add_state(firstI) - - #insertion state before the first base - hmm.add_transition(hmm.start, firstI, 0.25) #start to the first insertion - hmm.add_transition(firstI, firstI, 0.25) #self loop - - #to the base 1 insertion - hmm.add_transition(states[0][i], states[0][i], internalI2I , group='internal_I-to-I') - hmm.add_transition(states[1][i], states[0][i], internalM12I , group='internal_M-to-I') - - #to the base 1 match - hmm.add_transition(firstI, states[1][i], 0.5) #first insertion to first match - hmm.add_transition(states[1][i], states[1][i], internalM12M1 , group='internal_M-to-M') - hmm.add_transition(hmm.start, states[1][0], 0.5) #start to M - - #to the base 1 deletion - hmm.add_transition(firstI, states[2][i], 0.25) #first insertion to the first deletion - hmm.add_transition(hmm.start, states[2][0], 0.25) #start to D - - #the rest of the sequence - for i, char in enumerate(refSequence[1:-5]): - - i += 1 - - if (useBrdU and brduStart <= i and i <= brduEnd): - emissions = brduModel[refSequence[i:i+6]] - else: - emissions = thymidineModel[refSequence[i:i+6]] - - #correct for shift/scale/var - level_mu = scalings[0] + scalings[1] * emissions[0] - level_sig = 
scalings[2] * emissions[1] - - #print(refSequence[i:i+6]) - #print(level_mu,level_sig) - - #create states for this nucleotide - states[0][i] = State( UniformDistribution(0, 250, frozen=True), name='Insertion_'+str(i) ) - states[1][i] = State( NormalDistribution(level_mu, level_sig), name='Match_'+str(i) ) - states[2][i] = State( None, name='Deletion_'+str(i) ) - - #add the state objects to the hmm model object - for j in range(3): - hmm.add_state(states[j][i]) - - #internal transitions for this nucleotide - hmm.add_transition(states[1][i], states[1][i], internalM12M1 , group='internal_M-to-M') - hmm.add_transition(states[1][i], states[0][i], internalM12I , group='internal_M-to-I') - hmm.add_transition(states[0][i], states[0][i], internalI2I , group='internal_I-to-I') - - #this is really just a safety thing... get the last index in the iterator - for last_i,x in enumerate(refSequence[:-5]): - pass - - ########################################################################################################################### - # Handle Transitions Between Modules, Handle Analogue Branch - - #We have to reach forward to the next state (state i+1) so it's easier to just do this in a separate loop from the internal transitions one - for i, char in enumerate(refSequence[:-5]): - - #Don't execute this if we're at the end, because there's no i+1 to reach forward to. 
- if i != last_i: - - hmm.add_transition(states[1][i], states[2][i+1], externalM12D, group='external_M-to-D') - hmm.add_transition(states[2][i], states[2][i+1], externalD2D, group='external_D-to-D') - hmm.add_transition(states[2][i], states[1][i+1], externalD2M1, group='external_D-to-M') - hmm.add_transition(states[1][i], states[1][i+1], externalM12M1, group='external_M-to-M') - hmm.add_transition(states[0][i], states[1][i+1], externalI2M1, group='external_I-to-M') - - ########################################################################################################################### - # Handle Start and End - - #handle end states - hmm.add_transition(states[0][last_i], hmm.end, externalI2M1 ) - hmm.add_transition(states[1][last_i], hmm.end, externalM12M1 + externalM12D) - hmm.add_transition(states[2][last_i], hmm.end, 1.0) - - #bake the model - #hmm.bake(merge='all',verbose=True) - hmm.bake() - return hmm - -########################################################################################################################### -# MAIN - -verbose = False - -print('Loading pore models...') -f_thymidineModel = '/home/mb915/rds/hpc-work/development/DNAscent_dev/pore_models/template_median68pA.6mer.model' -thymidineModel = import_poreModel(f_thymidineModel) -f_brduModel = '/home/mb915/rds/hpc-work/development/DNAscent_dev/pore_models/BrdU.model' -brduModel = import_poreModel(f_brduModel) -print('Done.') - -#scalings: shift,scale,var - -allRatios = [] -brduProb = [] -thymProb = [] - -brduDel = [] -brduIns = [] -thymDel = [] -thymIns = [] - -maxEvents = 10000 - -f = open(sys.argv[1],'r') -idx = 0 -eventsRead = 0 -for line in f: - idx += 1 - if line[0] == '<': - idx = 0 - if idx == 1: - useBrdU = int(line.rstrip()) - elif idx == 2: - scalings = line.rstrip().split() - scalings = [float(i) for i in scalings] - elif idx == 3: - sequence = line.rstrip() - elif idx == 4: - events = line.rstrip().split() - events = [float(i) for i in events] - elif idx == 5: - 
eventsRead += 1 - hmm = build_TrainingHMM(sequence, thymidineModel, brduModel, scalings, useBrdU) - vitOut = hmm.viterbi(events) - viterbiStates = vitOut[1] - names = [] - insertions = 0 - deletions = 0 - for s in viterbiStates: - names.append(s[1].name) - if 'Insertion' in s[1].name: - insertions += 1 - elif 'Deletion' in s[1].name: - deletions += 1 - if useBrdU: - DNAscentProb_BrdU = float(line.rstrip()) - pomegranateProb_BrdU = hmm.log_probability(events) - namesBrdU = names - insertions_BrdU = insertions - deletions_BrdU = deletions - else: - DNAscentProb_thym = float(line.rstrip()) - pomegranateProb_thym = hmm.log_probability(events) - namesThym = names - insertions_Thym = insertions - deletions_Thym = deletions - if verbose: - print('------------------------------------------') - print('Log Likelihood Ratio: ',DNAscentProb_BrdU-DNAscentProb_thym) - print('Thymidine Insertions:',insertions_Thym) - print('Thymidine Deletions:',deletions_Thym) - print('BrdU Insertions:',insertions_BrdU) - print('BrdU Deletions:',deletions_BrdU) - print('Thym Path:',namesThym) - print('BrdU Path:',namesBrdU) - print('------------------------------------------') - - allRatios.append(DNAscentProb_BrdU-DNAscentProb_thym) - brduProb.append(DNAscentProb_BrdU) - thymProb.append(DNAscentProb_thym) - - brduIns.append(insertions_BrdU) - brduDel.append(deletions_BrdU) - thymIns.append(insertions_Thym) - thymDel.append(deletions_Thym) - - - if eventsRead % 1000: - print(eventsRead/float(maxEvents)) - - if eventsRead > maxEvents: - break - -f.close() - - -plt.figure() -plt.scatter(allRatios,brduProb,alpha=0.3) -plt.scatter(allRatios,thymProb,alpha=0.3) -plt.legend(['BrdU HMM','Thymidine HMM']) -plt.ylabel('Log Likelihood') -plt.xlabel('Log Likelihood Ratio (BrdU-to-Thym)') -plt.savefig('test_llRatios_Probability.pdf') -plt.close() - -plt.figure() -plt.scatter(allRatios,brduIns,alpha=0.3) -plt.scatter(allRatios,thymIns,alpha=0.3) -plt.legend(['BrdU HMM','Thymidine HMM']) -plt.ylabel('Number 
of Viterbi Insertions') -plt.xlabel('Log Likelihood Ratio (BrdU-to-Thym)') -plt.savefig('test_llRatios_Insertions.pdf') -plt.close() - -plt.figure() -plt.scatter(allRatios,brduDel,alpha=0.3) -plt.scatter(allRatios,thymDel,alpha=0.3) -plt.legend(['BrdU HMM','Thymidine HMM']) -plt.ylabel('Number of Viterbi Deletions') -plt.xlabel('Log Likelihood Ratio (BrdU-to-Thym)') -plt.savefig('test_llRatios_Deletions.pdf') -plt.close() - - diff --git a/tests/detect/htslib/Makefile b/tests/detect/htslib/Makefile deleted file mode 100644 index dc29548..0000000 --- a/tests/detect/htslib/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -CC = gcc -CXX = g++ -DEBUG = -g -LIBFLAGS = -CXXFLAGS = -Wall -O2 -fopenmp -std=c++14 $(DEBUG) -CFLAGS = -Wall -std=c99 -O2 $(DEBUG) - -#hdf5 -H5_LIB = ../../../hdf5-1.8.14/hdf5/lib/libhdf5.a -H5_INCLUDE = -I./../../../hdf5-1.8.14/hdf5/include -LIBFLAGS += -Wl,-rpath,../../../hdf5-1.8.14/hdf5/lib -L ../../../hdf5-1.8.14/hdf5/lib -lhdf5 - -#hts -HTS_LIB = ../../../htslib/libhts.a -HTS_INCLUDE = -I./../../../htslib -LIBFLAGS += -Wl,-rpath,../../../htslib -L ../../../htslib/ -lhts - -#fast5 -FAST5_INCLUDE = -I./../../../fast5/include - -#add include flags for each library -CXXFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) - -MAIN_EXECUTABLE = test_htslib_interface -EXC_SRC = test_htslib_interface.cpp - -#compile the test executable -$(MAIN_EXECUTABLE): $(HTS_LIB) $(H5_LIB) $(EXC_SRC) - $(CXX) -o $@ $(CXXFLAGS) $(EXC_SRC) ../../../src/data_IO.o ../../../src/pfasta/pfasta.o $(LIBFLAGS) - -clean: - rm $(MAIN_EXECUTABLE) \ No newline at end of file diff --git a/tests/detect/htslib/README.md b/tests/detect/htslib/README.md deleted file mode 100644 index 3676d1f..0000000 --- a/tests/detect/htslib/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# DNAscent Test - htslib Interface - -This test checks DNAscent's interface with htslib in order to parse and iterate on bam files, make sure we're getting genome coordinates right, and make sure we're handling reverse 
complements correctly. - -## Files - -`reference.fasta` contains some simple sequences and `reads.fasta` shows different insertions, deletions, mismatches, and inversions of these sequences. `alignments.sorted.bam` is the result of aligning these reads to the reference with minimap2 and then sorting the bam file. - -## Running - -`make` will compile the test program `test_htslib_interface`. Running the test program will show the mapping details for each test read, as well as details of the reference-to-query map. diff --git a/tests/detect/htslib/alignments.bam b/tests/detect/htslib/alignments.bam deleted file mode 100644 index e0df7a4..0000000 Binary files a/tests/detect/htslib/alignments.bam and /dev/null differ diff --git a/tests/detect/htslib/alignments.sorted.bam b/tests/detect/htslib/alignments.sorted.bam deleted file mode 100644 index 5600f98..0000000 Binary files a/tests/detect/htslib/alignments.sorted.bam and /dev/null differ diff --git a/tests/detect/htslib/alignments.sorted.bam.bai b/tests/detect/htslib/alignments.sorted.bam.bai deleted file mode 100644 index 4ca67ce..0000000 Binary files a/tests/detect/htslib/alignments.sorted.bam.bai and /dev/null differ diff --git a/tests/detect/htslib/reads.fasta b/tests/detect/htslib/reads.fasta deleted file mode 100644 index 349cb27..0000000 --- a/tests/detect/htslib/reads.fasta +++ /dev/null @@ -1,48 +0,0 @@ ->randSeq1 -ttcataagcggtacctctgtgctcgctttatatcgacacgattttatatgtacgcaaatt -acctagccgctacatagaggggtatacgggttattaggga ->randSeq1_5primeOverhang -ACTTGATTCttcataagcggtacctctgtgctcgctttatatcgacacgattttatatgtacgcaaatt -acctagccgctacatagaggggtatacgggttattaggga ->randSeq1_3primeOverhang -ttcataagcggtacctctgtgctcgctttatatcgacacgattttatatgtacgcaaatt -acctagccgctacatagaggggtatacgggttattagggaCACGGATTCA ->randSeq1_5primeDeletion -ttatatcgacacgattttatatgtacgcaaatt -acctagccgctacatagaggggtatacgggttattaggga ->randSeq1_3primeDeletion -ttcataagcggtacctctgtgctcgctttatatcgacacgattttatatgtacgcaaatt -accta ->randSeq1_revComp 
-tccctaataacccgtatacccctctatgtagcggctaggtaatttgcgtacatataaaatcgtgtcgatataaagcgagcacagaggtaccgcttatgaa ->randSeq1_revComp_5primeOverhang -ACCTtccctaataacccgtatacccctctatgtagcggctaggtaatttgcgtacatataaaatcgtgtcgatataaagcgagcacagaggtaccgcttatgaa ->randSeq1_revComp_3primeOverhang -tccctaataacccgtatacccctctatgtagcggctaggtaatttgcgtacatataaaatcgtgtcgatataaagcgagcacagaggtaccgcttatgaaAACGAATTCA ->randSeq1_revComp_5primeDeletion -ggctaggtaatttgcgtacatataaaatcgtgtcgatataaagcgagcacagaggtaccgcttatgaa ->randSeq1_revComp_3primeDeletion -tccctaataacccgtatacccctctatgtagcggctaggtaatttgcgtacatataaaatcgtgt ->randSeq1_revComp_10bpDeletionAt30 -tccctaataacccgtatacccctctatgttaatttgcgtacatataaaatcgtgtcgatataaagcgagcacagaggtaccgcttatgaa ->randSeq1_revComp_10bpInsertionAt60 -tccctaataacccgtatacccctctatgtagcggctaggtaatttgcgtacatataaaaATCGAATCGAtcgtgtcgatataaagcgagcacagaggtaccgcttatgaa ->randSeq2 -ttggagccgatgccttaatatttgcgctccctggagtggggaatcaatttgaaactcagc -ggtagtcttacgctacaagcaccgttgtctgggaactagg ->randSeq2_5primeDeletion -taatatttgcgctccctggagtggggaatcaatttgaaactcagc -ggtagtcttacgctacaagcaccgttgtctgggaactagg ->randSeq2_3primeDeletion -ttggagccgatgccttaatatttgcgctccctggagtggggaatcaatttgaaactcagc -ggtagtcttacgc ->randSeq2_revComp -cctagttcccagacaacggtgcttgtagcgtaagactaccgctgagtttcaaattgattccccactccagggagcgcaaatattaaggcatcggctccaa ->randSeq2_revComp_5primeDeletion -gcttgtagcgtaagactaccgctgagtttcaaattgattccccactccagggagcgcaaatattaaggcatcggctccaa ->randSeq2_revComp_3primeDeletion -cctagttcccagacaacggtgcttgtagcgtaagactaccgctgagtttcaaattgattccccactccaggga ->randSeq2_revComp_15bpDeletionAt20 -cctagttcccagacaacggactaccgctgagtttcaaattgattccccactccagggagcgcaaatattaaggcatcggctccaa ->randSeq2_revComp_4bpInsertionAt40 -cctagttcccagacaacggtgcttgtagcgtaagactacCGCTAcgctgagtttcaaattgattccccactccagggagcgcaaatattaaggcatcggctccaa diff --git a/tests/detect/htslib/reference.fasta b/tests/detect/htslib/reference.fasta deleted file mode 100644 index b4bd742..0000000 --- a/tests/detect/htslib/reference.fasta +++ 
/dev/null @@ -1,6 +0,0 @@ ->randSeq1 -ttcataagcggtacctctgtgctcgctttatatcgacacgattttatatgtacgcaaatt -acctagccgctacatagaggggtatacgggttattaggga ->randSeq2 -ttggagccgatgccttaatatttgcgctccctggagtggggaatcaatttgaaactcagc -ggtagtcttacgctacaagcaccgttgtctgggaactagg diff --git a/tests/detect/htslib/test_htslib_interface b/tests/detect/htslib/test_htslib_interface deleted file mode 100755 index 4730b25..0000000 Binary files a/tests/detect/htslib/test_htslib_interface and /dev/null differ diff --git a/tests/detect/htslib/test_htslib_interface.cpp b/tests/detect/htslib/test_htslib_interface.cpp deleted file mode 100644 index 369d248..0000000 --- a/tests/detect/htslib/test_htslib_interface.cpp +++ /dev/null @@ -1,272 +0,0 @@ -//---------------------------------------------------------- -// Copyright 2020 University of Cambridge -// Written by Michael A. Boemo (mb915@cam.ac.uk) -// This software is licensed under GPL-2.0. You should have -// received a copy of the license with this software. If -// not, please Email the author. 
-//---------------------------------------------------------- - -//NOTES: Tests DNAscent detect functions that interface with htslib - -#include "../../../htslib/htslib/hts.h" -#include "../../../htslib/htslib/sam.h" -#include "../../../src/data_IO.h" -#include "../../../src/error_handling.h" -#include "../../../src/common.h" //reverse complement -#include -#include -#include - -std::string getQuerySequence( bam1_t *record ){ - - std::string seq; - uint8_t *a_seq = bam_get_seq(record); - for ( int i = 0; i < record -> core.l_qseq; i++){ - int seqInBase = bam_seqi(a_seq,i); - - switch (seqInBase) { - - case 1: seq += "A"; break; - case 2: seq += "C"; break; - case 4: seq += "G"; break; - case 8: seq += "T"; break; - case 15: seq += "N"; break; - default: throw ParsingError(); - } - } - return seq; -} - -void parseCigar(bam1_t *record, std::map< unsigned int, unsigned int > &ref2query, int &refStart, int &refEnd ){ - - //initialise reference and query coordinates for the first match - refStart = record -> core.pos; - int queryPosition = 0; - int refPosition = 0; - - const uint32_t *cigar = bam_get_cigar(record); - - if ( bam_is_rev(record) ){ - - for ( int i = record -> core.n_cigar - 1; i >= 0; i--){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match, advance both reference and query together - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - refPosition += ol; - } - //for a deletion, advance only the reference position - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - } - refPosition += ol; - } - //for insertions or soft clipping, advance only the query position - else if (op == BAM_CSOFT_CLIP or op == BAM_CINS){ - - for ( int j = 
refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - } - //N.B. hard clipping advances neither reference nor query, so ignore it - } - } - else { - - for ( unsigned int i = 0; i < record -> core.n_cigar; ++i){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match, advance both reference and query together - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - refPosition += ol; - } - //for a deletion, advance only the reference position - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - } - refPosition += ol; - } - //for insertions or soft clipping, advance only the query position - else if (op == BAM_CSOFT_CLIP or op == BAM_CINS){ - - for ( int j = refPosition; j < refPosition + ol; j++ ){ - - ref2query[j] = queryPosition; - queryPosition++; - } - } - //N.B. hard clipping advances neither reference nor query, so ignore it - } - } - refEnd = refStart + refPosition; -} - -void getRefEnd(bam1_t *record, int &refStart, int &refEnd ){ - - //initialise reference coordinates for the first match - refStart = record -> core.pos; - int refPosition = 0; - - const uint32_t *cigar = bam_get_cigar(record); - - if ( bam_is_rev(record) ){ - - for ( int i = record -> core.n_cigar - 1; i >= 0; i--){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - refPosition += ol; - } - //for a deletion - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - refPosition += ol; - } - //for insertions, advance only the query position so skip - //N.B. 
hard clipping advances neither reference nor query, so ignore it - } - } - else { - - for ( unsigned int i = 0; i < record -> core.n_cigar; ++i){ - - const int op = bam_cigar_op(cigar[i]); //cigar operation - const int ol = bam_cigar_oplen(cigar[i]); //number of consecutive operations - - //for a match, advance both reference and query together - if (op == BAM_CMATCH or op == BAM_CEQUAL or op == BAM_CDIFF){ - - refPosition += ol; - } - //for a deletion, advance only the reference position - else if (op == BAM_CDEL or op == BAM_CREF_SKIP){ - - refPosition += ol; - } - //for insertions, advance only the query position so skip - //N.B. hard clipping advances neither reference nor query, so ignore it - } - } - refEnd = refStart + refPosition; -} - -int main(void){ - - std::string fn_bam = "alignments.sorted.bam"; - std::string fn_reference = "reference.fasta"; - - htsFile* bam_fh; - hts_idx_t* bam_idx; - bam_hdr_t* bam_hdr; - hts_itr_t* itr; - - //import fasta reference - std::map< std::string, std::string > reference = import_reference_pfasta( fn_reference ); - - //load the bam - std::cout << "Opening bam file... "; - bam_fh = sam_open((fn_bam).c_str(), "r"); - if (bam_fh == NULL) throw IOerror(fn_bam); - - //load the index - bam_idx = sam_index_load(bam_fh, (fn_bam).c_str()); - if (bam_idx == NULL) throw IOerror("index for "+fn_bam); - - //load the header - bam_hdr = sam_hdr_read(bam_fh); - std::cout << "ok." 
<< std::endl; - - //build an iterator for all reads in the bam file - const char *allReads = "."; - itr = sam_itr_querys(bam_idx,bam_hdr,allReads); - int result; - - do { - //initialise the record and get the record from the file iterator - bam1_t *record = bam_init1(); - result = sam_itr_next(bam_fh, itr, record); - - //get the read name (which will be the ONT readID from Albacore basecall) - const char *queryName = bam_get_qname(record); - if (queryName == NULL) continue; - std::string s_queryName(queryName); - std::cout <<"=================================================================" << std::endl; - std::cout <<"Read name: " << s_queryName << std::endl; - - //get the name of the reference mapped to - std::string mappedTo(bam_hdr -> target_name[record -> core.tid]); - - std::cout << "Mapped to reference: " << mappedTo << std::endl; - - int mappingQual = record -> core.qual; - int refStart,refEnd; - getRefEnd(record,refStart,refEnd); - - std::cout <<"Mapping quality: " << mappingQual << std::endl; - std::cout <<"Mapped between reference coords: " << refStart << ", " << refEnd << std::endl; - - - //fetch the basecall from the bam file - std::string basecall = getQuerySequence(record); - - /*get the subsequence of the reference this read mapped to */ - std::string referenceSeqMappedTo = reference.at(mappedTo).substr(refStart, refEnd - refStart); - - //account for reverse complements - if ( bam_is_rev(record) ){ - - basecall = reverseComplement( basecall ); - referenceSeqMappedTo = reverseComplement( referenceSeqMappedTo ); - } - - //iterate on the cigar string to fill up the reference-to-query coordinate map - std::map refToQuery; - parseCigar(record, refToQuery, refStart, refEnd); - - //print ref2query results - for (auto dic = refToQuery.begin(); dic != refToQuery.end(); dic++){ - - //print the position alignment - std::cout << dic -> first << " --> " << dic -> second << std::endl; - - //print the bases - std::string refBase, queryBase; - if (dic -> first > 
referenceSeqMappedTo.length()-1) refBase = "X"; - else refBase = referenceSeqMappedTo.substr(dic -> first,1); - if (dic -> second > basecall.length()-1) refBase = "X"; - else queryBase = basecall.substr(dic -> second,1); - std::cout << refBase << " --> " << queryBase << std::endl; - } - - bam_destroy1(record); - - } while(result > 0); -} diff --git a/tests/forkSense/originConfidenceWindows/README.md b/tests/forkSense/originConfidenceWindows/README.md deleted file mode 100644 index 1bfc769..0000000 --- a/tests/forkSense/originConfidenceWindows/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# DNAscent Test - Origin Confidence - -DNAscent forkSense can output a bed file with replication origin locations, and specifies confidence windows of where these origins are located. This test checks the width of these confidence windows in early S-phase cells where the confidence windows should be small. - -## Files - -`originWindowWidth.py` - -## Running - -Run DNAscent detect followd by forkSense with --markOrigins, ideally on the 1x BrdU cell cycle data released with the Nature Methods paper. Usage for the script is: -`python originWindowWidth.py /path/to/origins_DNAscent_forkSense.bed` -This will plot a distribution of confidence window sizes. 
diff --git a/tests/forkSense/originConfidenceWindows/originWindowWidth.py b/tests/forkSense/originConfidenceWindows/originWindowWidth.py deleted file mode 100644 index f98df62..0000000 --- a/tests/forkSense/originConfidenceWindows/originWindowWidth.py +++ /dev/null @@ -1,25 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import sys - -buf = [] - -f = open(sys.argv[1],'r') -for line in f: - splitLine = line.rstrip().split() - chromosome = splitLine[0] - if chromosome == 'chrM': - continue - dist = int(splitLine[2]) - int(splitLine[1]) - buf.append(dist) - -f.close() - -plt.figure() -plt.hist(buf,50) -plt.xlim(0,10000) -plt.xlabel('Size of Origin Call Confidence Window') -plt.ylabel('Count') -plt.savefig('originWidths.pdf') -plt.close() diff --git a/tests/forkSense/originLocation/README.md b/tests/forkSense/originLocation/README.md deleted file mode 100644 index 62aa3f7..0000000 --- a/tests/forkSense/originLocation/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# DNAscent Test - Origin Calling Accuracy - -This test checks the accuracy of forkSense origin calls by making sure they are generally close to known origins in S. cerevisiae. - -## Files - -`distanceFromNearestOrigin.py` - -This will also require a bed file of all origins from oridb (http://cerevisiae.oridb.org/). - -## Running - -Run DNAscent detect followd by forkSense with --markOrigins, ideally on the 1x BrdU cell cycle data released with the Nature Methods paper. Usage for the script is: -`python distanceFromNearestOrigin.py /path/to/origins_DNAscent_forkSense.bed` -This will show a distribution of distances from DNAscent origin calls to the nearest known origin. 
diff --git a/tests/forkSense/originLocation/distanceFromNearestOrigin.py b/tests/forkSense/originLocation/distanceFromNearestOrigin.py deleted file mode 100644 index 6bc2491..0000000 --- a/tests/forkSense/originLocation/distanceFromNearestOrigin.py +++ /dev/null @@ -1,94 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import sys -import numpy as np - -#path to oridb bed file -f_oridb = '/home/mb915/rds/rds-mb915-notbackedup/oridb.bed' - -def namefix(num): - - if num == 'chr1': - return 'chrI' - elif num == 'chr2': - return 'chrII' - elif num == 'chr3': - return 'chrIII' - elif num == 'chr4': - return 'chrIV' - elif num == 'chr5': - return 'chrV' - elif num == 'chr6': - return 'chrVI' - elif num == 'chr7': - return 'chrVII' - elif num == 'chr8': - return 'chrVIII' - elif num == 'chr9': - return 'chrIX' - elif num == 'chr10': - return 'chrX' - elif num == 'chr11': - return 'chrXI' - elif num == 'chr12': - return 'chrXII' - elif num == 'chr13': - return 'chrXIII' - elif num == 'chr14': - return 'chrXIV' - elif num == 'chr15': - return 'chrXV' - elif num == 'chr16': - return 'chrXVI' - else: - print(num) - print('problem') - -#parse oridb -f = open(f_oridb,'r') -oridb_chr2bounds = {} -for line in f: - splitLine = line.rstrip().split() - chromosome = namefix(splitLine[0]) - lb = int(splitLine[1]) - ub = int(splitLine[2]) - if chromosome in oridb_chr2bounds: - oridb_chr2bounds[chromosome].append((lb,ub)) - else: - oridb_chr2bounds[chromosome] = [(lb,ub)] -f.close() - -#parse the origin calls -distances = [] -f = open(sys.argv[1],'r') -for line in f: - splitLine = line.rstrip().split() - chromosome = splitLine[0] - - if chromosome == 'chrM': - continue - - lb = int(splitLine[2]) - ub = int(splitLine[1]) - - minDist = 1000000000 - for ori in oridb_chr2bounds[chromosome]: - if ori[0] < lb < ori[1] or ori[0] < ub < ori[1] or lb < ori[0] < ub or lb < ori[1] < ub: - minDist = 0 - break - elif (ori[1] < lb and lb - ori[1] < minDist): - 
minDist = lb - ori[1] - elif (ub < ori[0] and ori[0] - ub < minDist): - minDist = ori[0] - ub - distances.append(minDist) -f.close() - -plt.figure() -plt.hist(distances,50) -plt.xlabel('Distance to Nearest Origin') -plt.ylabel('Count') -plt.savefig('distToNearestOri.pdf') -plt.close() - -print(np.mean(distances)) diff --git a/tests/regions/clustering/README.md b/tests/regions/clustering/README.md deleted file mode 100644 index 68e38c8..0000000 --- a/tests/regions/clustering/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# DNAscent Test - Clustering - -This test checks the ability of DNAscent regions to find the T-to-B substitution rate in BrdU-positive regions. - -## Files - -`testClustering.py` - -## Running - -Set #define TEST_CLUSTERING 1 in DNAscent regions and recompile. Run DNAscent regions on any run that has undergone a BrdU pulse and redirect stderr to file.txt. Run `python testClustering.py file.txt`. diff --git a/tests/regions/clustering/testClustering.py b/tests/regions/clustering/testClustering.py deleted file mode 100644 index 70e9a54..0000000 --- a/tests/regions/clustering/testClustering.py +++ /dev/null @@ -1,32 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import numpy as np -import sys - -#Usage: python testClustering.py DNAscentregions.stderr -#where DNAscentregions.stderr is from stderr after running DNAscent regions with #define TEST_CLUSTERING 1 - - -centroids = [] -clusteredPoints = [[],[]] -index = -1 - -f = open(sys.argv[1],'r') -for line in f: - if line[0] == '>': - centroids.append(float(line.rstrip()[1:])) - index += 1 - else: - clusteredPoints[index].append(float(line.rstrip())) -f.close() - -plt.figure() -plt.hist(clusteredPoints[1]+clusteredPoints[0],50,alpha=0.1) -plt.hist(clusteredPoints[0],50,alpha=0.3) -plt.scatter(centroids[0],[1]) -plt.hist(clusteredPoints[1],50,alpha=0.3) -plt.scatter(centroids[1],[1]) -plt.xlabel('P(Thym is BrdU)') -plt.ylabel('Count') 
-plt.savefig('regions_clusterTest.pdf') diff --git a/tests/regions/mixture/README.md b/tests/regions/mixture/README.md deleted file mode 100644 index 48bb6dc..0000000 --- a/tests/regions/mixture/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# DNAscent Test - Mixture in Regions - -This test checks the ability of DNAscent regions to create two separable populations of BrdU and thymidine regions. - -## Files - -`testRegionSeparation.py` - -## Running - -Run DNAscent regions to produce the normal output DNAscent.regions. Then run `python testRegionSeparation.py DNAscent.regions`. diff --git a/tests/regions/mixture/testRegionSeparation.py b/tests/regions/mixture/testRegionSeparation.py deleted file mode 100644 index 9a6f486..0000000 --- a/tests/regions/mixture/testRegionSeparation.py +++ /dev/null @@ -1,29 +0,0 @@ -import matplotlib -matplotlib.use('Agg') -from matplotlib import pyplot as plt -import numpy as np -import sys - -#Usage: python testClustering.py DNAscent.regions -#where DNAscent.regions is the normal output from DNAscent regions - -regionScores = [] -f = open(sys.argv[1],'r') -for line in f: - - if len(line.rstrip()) == 0: - continue - - if line[0] == '>': - continue - else: - splitLine = line.rstrip().split() - regionScores.append(float(splitLine[2])) -f.close() - -plt.figure() -plt.hist(regionScores,50) -plt.xlabel('Region Score') -plt.ylabel('Count') -plt.savefig('regionScoreHistogram.pdf') -plt.close() diff --git a/utils/dnascent2bedgraph.py b/utils/dnascent2bedgraph.py index 13f88c7..aeb93da 100644 --- a/utils/dnascent2bedgraph.py +++ b/utils/dnascent2bedgraph.py @@ -1,5 +1,5 @@ #---------------------------------------------------------- -# Copyright 2019-2020 University of Oxford +# Copyright 2020 University of Cambridge # Written by Michael A. Boemo (mb915@cam.ac.uk) # This software is licensed under GPL-3.0. You should have # received a copy of the license with this software. 
If @@ -12,21 +12,21 @@ #-------------------------------------------------------------------------------------------------------------------------------------- def splashHelp(): - s = """dnascent2bedgraph.py: Converts the output of DNAscent detect, regions, and forkSense into bedgraphs. + s = """dnascent2bedgraph.py: Converts the output of DNAscent detect and forkSense into bedgraphs. To run dnascent2bedgraph.py, do: python dnascent2bedgraph.py [arguments] Example: python dnascent2bedgraph.py -d /path/to/dnascentDetect.out -f /path/to/dnascentForksense.out -o /path/to/newBedgraphDir Required arguments are at least one of the following: -d,--detect path to DNAscent detect output file, - -f,--forkSense path to DNAscent forkSense output file, - -r,--regions path to DNAscent regions output file. + -f,--forkSense path to DNAscent forkSense output file. Required argument is: -o,--output output directory which will be created. Optional arguments are: --minLength only convert reads with specified minimum read length (in base pairs) into bedgraphs (default: 1), --maxLength only convert reads with specified maximum read length (in base pairs) into bedgraphs (default: Inf), -n,--maxReads maximum number of reads to convert into bedgraphs (default: Inf), + --targets forkSense bed file with specific reads to plot, --filesPerDir maximum reads per subdirectory (default: 300). Written by Michael Boemo, Department of Pathology, University of Cambridge. 
Please submit bug reports to GitHub Issues (https://github.com/MBoemo/DNAscent/issues).""" @@ -49,6 +49,7 @@ def parseArguments(args): a.maxLength = 1000000000 a.maxReads = 1000000000 a.filesPerDir = 300 + a.useTargets = False for i, argument in enumerate(args): @@ -58,9 +59,6 @@ def parseArguments(args): elif argument == '-f' or argument == '--forkSense': a.sensePath = str(args[i+1]) - elif argument == '-r' or argument == '--regions': - a.regionsPath = str(args[i+1]) - elif argument == '-o' or argument == '--output': a.outDir = str(args[i+1]) @@ -73,6 +71,10 @@ def parseArguments(args): elif argument == '-n' or argument == '--maxReads': a.maxReads = int(args[i+1]) + elif argument == '--targets': + a.targetPath = str(args[i+1]) + a.useTargets = True + elif argument == '--filesPerDir': a.filesPerDir = int(args[i+1]) @@ -80,7 +82,7 @@ def parseArguments(args): splashHelp() #check that required arguments are met - if not ( ( hasattr( a, 'detectPath') or hasattr( a, 'sensePath') or hasattr( a, 'regionsPath') ) and hasattr( a, 'outDir') ): + if not ( ( hasattr( a, 'detectPath') or hasattr( a, 'sensePath') ) and hasattr( a, 'outDir') ): splashHelp() return a @@ -89,31 +91,23 @@ def parseArguments(args): def makeDetectLine(line, chromosome): splitLine = line.rstrip().split() pos = int(splitLine[0]) - probBrdU = float(splitLine[1]) + probBrdU = float(splitLine[2]) + probEdU = float(splitLine[1]) sixMer = splitLine[2] - return chromosome + ' ' + str(pos) + ' ' + str(pos+1) + ' ' + str(probBrdU) + '\n' + return (chromosome + ' ' + str(pos) + ' ' + str(pos+1) + ' ' + str(probBrdU) + '\n',chromosome + ' ' + str(pos) + ' ' + str(pos+1) + ' ' + str(probEdU) + '\n') #-------------------------------------------------------------------------------------------------------------------------------------- def makeSenseLine(line, chromosome, prevPos): splitLine = line.rstrip().split() pos = int(splitLine[0]) - probForkLeft = float(splitLine[1]) - probForkRight = float(splitLine[2]) - 
return (chromosome + ' ' + str(prevPos) + ' ' + str(pos) + ' ' + str(probForkLeft) + '\n', chromosome + ' ' + str(prevPos) + ' ' + str(pos) + ' ' + str(probForkRight) + '\n') + probEdUsegment = float(splitLine[1]) + probBrdUsegment = float(splitLine[2]) + return (chromosome + ' ' + str(prevPos) + ' ' + str(pos) + ' ' + str(probBrdUsegment) + '\n', chromosome + ' ' + str(prevPos) + ' ' + str(pos) + ' ' + str(probEdUsegment) + '\n', chromosome + ' ' + str(prevPos) + ' ' + str(pos) + ' ' + '\n') #-------------------------------------------------------------------------------------------------------------------------------------- -def makeRegionsLine(line, chromosome): - splitLine = line.rstrip().split() - posStart = int(splitLine[0]) - posEnd = int(splitLine[1]) - regionScore = float(splitLine[2]) - return chromosome + ' ' + str(posStart) + ' ' + str(posEnd) + ' ' + str(regionScore) + '\n' - - -#-------------------------------------------------------------------------------------------------------------------------------------- -def parseBaseFile(fname, args): +def parseBaseFile(fname, args, targetIDs): print('Parsing '+fname[0]+'...') first = True count = 0 @@ -134,6 +128,22 @@ def parseBaseFile(fname, args): if not first: + if args.useTargets and readID not in targetIDs: + + #get readID and chromosome + splitLine = line.rstrip().split(' ') + readID = splitLine[0][1:] + chromosome = splitLine[1] + strand = splitLine[4] + mappingStart = int(splitLine[2]) + mappingEnd = int(splitLine[3]) + prevPos = mappingStart + + first = False + buff = [] + + continue + rLen = mappingEnd - mappingStart if rLen > args.minLength and rLen < args.maxLength: @@ -151,38 +161,34 @@ def parseBaseFile(fname, args): readID2directory[readID] = directoryCount if fname[1] == "detect": - f_bg = open( args.outDir + '/' + str(directoryCount) + '/' + readID + '.detect.bedgraph','w') + f_bg = open( args.outDir + '/' + str(directoryCount) + '/' + readID + '.BrdUdetect.bedgraph','w') + f_bg2 = open( 
args.outDir + '/' + str(directoryCount) + '/' + readID + '.EdUdetect.bedgraph','w') f_bg.write( 'track type=bedGraph name="'+readID +'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_bg2.write( 'track type=bedGraph name="'+readID +'" description="BedGraph format" visibility=full color=93,197,186 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_bg.write(l) + f_bg.write(l[0]) + f_bg2.write(l[1]) f_bg.close() - - elif fname[1] == "regions": - f_regions = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_regions.bedgraph','w') - f_regions.write( 'track type=bedGraph name="'+readID + '_' + strand + '_regions'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=-3.0:3.0'+'\n') - - for l in buff: - f_regions.write(l) - f_regions.close() + f_bg2.close() elif fname[1] == "sense": #leftward moving fork - f_forkLeft = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkLeft.bedgraph','w') - f_forkLeft.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkLeft'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_BrdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_BrdUsegment.bedgraph','w') + f_BrdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_BrdUsegment'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkLeft.write(l[0]) - f_forkLeft.close() + f_BrdUsegment.write(l[0]) + f_BrdUsegment.close() #rightward moving fork - f_forkRight = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkRight.bedgraph','w') - f_forkRight.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkRight'+'" 
description="BedGraph format" visibility=full color=0,0,255 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_EdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_EdUsegment.bedgraph','w') + f_EdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_EdUsegment'+'" description="BedGraph format" visibility=full color=93,197,186 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkRight.write(l[1]) - f_forkRight.close() - + f_EdUsegment.write(l[1]) + f_EdUsegment.close() + #get readID and chromosome splitLine = line.rstrip().split(' ') @@ -196,12 +202,26 @@ def parseBaseFile(fname, args): first = False buff = [] + elif line[0] == '%': + continue + + + elif args.useTargets: + + if readID not in targetIDs: + continue + else: + if fname[1] == "detect": + buff.append( makeDetectLine(line,chromosome) ) + elif fname[1] == "sense": + splitLine = line.rstrip().split() + pos = int(splitLine[0]) + buff.append( makeSenseLine(line,chromosome,prevPos) ) + prevPos = pos else: if fname[1] == "detect": buff.append( makeDetectLine(line,chromosome) ) - elif fname[1] == "regions": - buff.append( makeRegionsLine(line,chromosome) ) elif fname[1] == "sense": splitLine = line.rstrip().split() pos = int(splitLine[0]) @@ -220,45 +240,41 @@ def parseBaseFile(fname, args): readID2directory[readID] = directoryCount if fname[1] == "detect": - f_bg = open( args.outDir + '/' + str(directoryCount) + '/' + readID + '.detect.bedgraph','w') + f_bg = open( args.outDir + '/' + str(directoryCount) + '/' + readID + '.BrdUdetect.bedgraph','w') + f_bg2 = open( args.outDir + '/' + str(directoryCount) + '/' + readID + '.EdUdetect.bedgraph','w') f_bg.write( 'track type=bedGraph name="'+readID +'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_bg2.write( 'track type=bedGraph name="'+readID +'" description="BedGraph format" visibility=full 
color=93,197,186 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_bg.write(l) + f_bg.write(l[0]) + f_bg2.write(l[1]) f_bg.close() - - elif fname[1] == "regions": - f_regions = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_regions.bedgraph','w') - f_regions.write( 'track type=bedGraph name="'+readID + '_' + strand + '_regions'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=-3.0:3.0'+'\n') - - for l in buff: - f_regions.write(l) - f_regions.close() + f_bg2.close() elif fname[1] == "sense": #leftward moving fork - f_forkLeft = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkLeft.bedgraph','w') - f_forkLeft.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkLeft'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_BrdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_BrdUsegment.bedgraph','w') + f_BrdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_BrdUsegment'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkLeft.write(l[0]) - f_forkLeft.close() + f_BrdUsegment.write(l[0]) + f_BrdUsegment.close() #rightward moving fork - f_forkRight = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkRight.bedgraph','w') - f_forkRight.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkRight'+'" description="BedGraph format" visibility=full color=0,0,255 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_EdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_EdUsegment.bedgraph','w') + f_EdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_EdUsegment'+'" description="BedGraph format" visibility=full 
color=93,197,186 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkRight.write(l[1]) - f_forkRight.close() - + f_EdUsegment.write(l[1]) + f_EdUsegment.close() + f.close() print('Done.') return readID2directory #-------------------------------------------------------------------------------------------------------------------------------------- -def parseSecondaryFile(fname, readID2directory,args): +def parseSecondaryFile(fname, readID2directory,args, targetIDs): print('Parsing '+fname[0]+'...') f = open(fname[0],'r') first = True @@ -278,6 +294,22 @@ def parseSecondaryFile(fname, readID2directory,args): if not first: + if args.useTargets and readID not in targetIDs: + + #get readID and chromosome + splitLine = line.rstrip().split(' ') + readID = splitLine[0][1:] + chromosome = splitLine[1] + strand = splitLine[-1:][0] + mappingStart = int(splitLine[2]) + mappingEnd = int(splitLine[3]) + prevPos = mappingStart + + first = False + buff = [] + + continue + rLen = mappingEnd - mappingStart if rLen > args.minLength and rLen < args.maxLength: @@ -291,29 +323,21 @@ def parseSecondaryFile(fname, readID2directory,args): if fname[1] == "sense": #leftward moving fork - f_forkLeft = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkLeft.bedgraph','w') - f_forkLeft.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkLeft'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_BrdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_BrdUsegment.bedgraph','w') + f_BrdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_BrdUsegment'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkLeft.write(l[0]) - f_forkLeft.close() + f_BrdUsegment.write(l[0]) + f_BrdUsegment.close() #rightward moving fork - 
f_forkRight = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkRight.bedgraph','w') - f_forkRight.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkRight'+'" description="BedGraph format" visibility=full color=0,0,255 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_EdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_EdUsegment.bedgraph','w') + f_EdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_EdUsegment'+'" description="BedGraph format" visibility=full color=93,197,186 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkRight.write(l[1]) - f_forkRight.close() - - - elif fname[1] == "regions": - f_regions = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_regions.bedgraph','w') - f_regions.write( 'track type=bedGraph name="'+readID + '_' + strand + '_regions'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=-3.0:3.0'+'\n') - - for l in buff: - f_regions.write(l) - f_regions.close() + f_EdUsegment.write(l[1]) + f_EdUsegment.close() + #get readID and chromosome splitLine = line.rstrip().split(' ') @@ -326,7 +350,19 @@ def parseSecondaryFile(fname, readID2directory,args): first = False buff = [] + elif line[0] == '%': + continue + + elif args.useTargets: + if readID not in targetIDs: + continue + else: + if fname[1] == "sense": + splitLine = line.rstrip().split() + pos = int(splitLine[0]) + buff.append( makeSenseLine(line,chromosome,prevPos) ) + prevPos = pos else: if fname[1] == "sense": @@ -334,38 +370,29 @@ def parseSecondaryFile(fname, readID2directory,args): pos = int(splitLine[0]) buff.append( makeSenseLine(line,chromosome,prevPos) ) prevPos = pos - elif fname[1] == "regions": - buff.append( makeRegionsLine(line,chromosome) ) rLen = mappingEnd - mappingStart if rLen > args.minLength and rLen < args.maxLength and count < args.maxReads 
and readID in readID2directory: - if fname[1] == "regions": - f_regions = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_regions.bedgraph','w') - f_regions.write( 'track type=bedGraph name="'+readID + '_' + strand + '_regions'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=-3.0:3.0'+'\n') - - for l in buff: - f_regions.write(l) - f_regions.close() - - elif fname[1] == "sense": + if fname[1] == "sense": #leftward moving fork - f_forkLeft = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkLeft.bedgraph','w') - f_forkLeft.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkLeft'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_BrdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_BrdUsegment.bedgraph','w') + f_BrdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_BrdUsegment'+'" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkLeft.write(l[0]) - f_forkLeft.close() + f_BrdUsegment.write(l[0]) + f_BrdUsegment.close() #rightward moving fork - f_forkRight = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_forkRight.bedgraph','w') - f_forkRight.write( 'track type=bedGraph name="'+readID + '_' + strand + '_forkRight'+'" description="BedGraph format" visibility=full color=0,0,255 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') + f_EdUsegment = open( args.outDir + '/' + str(readID2directory[readID]) + '/' + readID + '_EdUsegment.bedgraph','w') + f_EdUsegment.write( 'track type=bedGraph name="'+readID + '_' + strand + '_EdUsegment'+'" description="BedGraph format" visibility=full color=93,197,186 altColor=0,100,200 priority=20 viewLimits=0.0:1.0'+'\n') for l in buff: - f_forkRight.write(l[1]) 
- f_forkRight.close() + f_EdUsegment.write(l[1]) + f_EdUsegment.close() + print('Done.') f.close() @@ -376,6 +403,17 @@ def parseSecondaryFile(fname, readID2directory,args): args = parseArguments(sys.argv[1:]) +targetIDs = [] +if args.useTargets: + f = open(args.targetPath,'r') + for line in f: + if line[0] == '#': + continue + splitLine = line.rstrip().split() + readID = splitLine[3] + targetIDs.append(readID) + f.close() + #check the output if args.outDir[-1:] == "/": args.outDir = args.outDir[:-1] @@ -392,46 +430,11 @@ def parseSecondaryFile(fname, readID2directory,args): baseFname = (args.detectPath,"detect") if hasattr( args, 'sensePath'): secondaryFname.append((args.sensePath,"sense")) - if hasattr( args, 'regionsPath'): - secondaryFname.append((args.regionsPath,"regions")) - else: - if hasattr( args, 'regionsPath') and hasattr( args, 'sensePath'): - - #check which one has more reads, in case they were run on partial datasets - f = open(args.sensePath,'r') - readCountSense = 0 - for line in f: - if line[0] == '>': - readCountSense += 1 - f.close() - - f = open(args.regionsPath,'r') - readCountRegions = 0 - for line in f: - if line[0] == '>': - readCountRegions += 1 - f.close() - - if readCountRegions > readCountSense: - baseFname = (args.regionsPath,"regions") - secondaryFname.append((args.sensePath,"sense")) - else: - baseFname = (args.sensePath,"sense") - secondaryFname.append((args.regionsPath,"regions")) - - elif hasattr( args, 'regionsPath'): - baseFname = (args.regionsPath,"regions") - - elif hasattr( args, 'sensePath'): + if hasattr( args, 'sensePath'): baseFname = (args.sensePath,"sense") -readID2directory = parseBaseFile(baseFname, args) +readID2directory = parseBaseFile(baseFname, args, targetIDs) for fname in secondaryFname: - parseSecondaryFile(fname, readID2directory, args) - - - - - + parseSecondaryFile(fname, readID2directory, args, targetIDs)