diff --git a/.gitignore b/.gitignore
index 1e830bf..c95bf93 100755
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ examples/
hdf5-1.8.14/
tensorflow/
hdf5-1.8.14.tar.gz
+htslib/
#generated
.depend
@@ -26,8 +27,9 @@ hdf5-1.8.14.tar.gz
.pydevproject
.settings/*
-#commit log file
+#commit log file and path
src/gitcommit.h
+src/softwarepath.h
#plots
*.png
diff --git a/Makefile b/Makefile
index 7b731a1..eea974c 100755
--- a/Makefile
+++ b/Makefile
@@ -1,30 +1,43 @@
CC = gcc
CXX = g++
DEBUG = -g
-LIBFLAGS =
+LIBFLAGS = -lrt
+LDFLAGS ?= -ldl -llzma -lbz2 -lm -lz
CXXFLAGS = -Wall -O2 -fopenmp -std=c++14
CFLAGS = -Wall -std=c99 -O2
+SPACE:= ;
+SPACE+=;
+null :=
+space := ${null} ${null}
+${space} := ${space}
+
+CURRENT_PATH := $(subst $(lastword $(notdir $(MAKEFILE_LIST))),,$(subst $(SPACE),\$(SPACE),$(shell realpath '$(strip $(MAKEFILE_LIST))')))
+PATH_SPACEFIX := $(subst ${space},\${space},${CURRENT_PATH})
+
+ifeq ($(zstd),1)
+ LDFLAGS += -lzstd
+endif
+
#hdf5
H5_LIB = ./hdf5-1.8.14/hdf5/lib/libhdf5.a
H5_INCLUDE = -I./hdf5-1.8.14/hdf5/include
-LIBFLAGS += -Wl,-rpath,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))hdf5-1.8.14/hdf5/lib -L hdf5-1.8.14/hdf5/lib -lhdf5
#hts
HTS_LIB = ./htslib/libhts.a
HTS_INCLUDE = -I./htslib
-LIBFLAGS += -Wl,-rpath,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))htslib -L htslib/ -lhts
#tensorflow
-TENS_LIB = ./tensorflow/include/tensorflow/c/c_api.h
+TENS_DEPEND = tensorflow/include/tensorflow/c/c_api.h
+TENS_LIB = -Wl,-rpath,${PATH_SPACEFIX}tensorflow/lib -L tensorflow/lib
TENS_INCLUDE = -I./tensorflow/include
-LIBFLAGS += -Wl,-rpath,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))tensorflow/lib -L tensorflow/lib -ltensorflow
+LIBFLAGS = -ltensorflow
#fast5
FAST5_INCLUDE = -I./fast5/include
#add include flags for each library
-CXXFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(TENS_INCLUDE)
+CPPFLAGS += $(H5_INCLUDE) $(HTS_INCLUDE) $(FAST5_INCLUDE) $(TENS_INCLUDE)
MAIN_EXECUTABLE = bin/DNAscent
@@ -47,12 +60,12 @@ tensorflow/include/tensorflow/c/c_api.h:
if [ ! -e tensorflow/include/tensorflow/c/c_api.h ]; then \
mkdir tensorflow; \
cd tensorflow; \
- wget https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.15.0.tar.gz; \
- tar -xzf libtensorflow-gpu-linux-x86_64-1.15.0.tar.gz || exit 255; \
+ wget https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-2.4.1.tar.gz; \
+ tar -xzf libtensorflow-gpu-linux-x86_64-2.4.1.tar.gz || exit 255; \
cd ..; \
fi
-SUBDIRS = src src/scrappie src/pfasta
+SUBDIRS = src src/scrappie src/pfasta src/sgsmooth
CPP_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.cpp))
C_SRC := $(foreach dir, $(SUBDIRS), $(wildcard $(dir)/*.c))
EXE_SRC = src/DNAscent.cpp
@@ -61,27 +74,31 @@ EXE_SRC = src/DNAscent.cpp
src/gitcommit.h: .git/HEAD .git/index
echo "const char *gitcommit = \"$(shell git rev-parse HEAD)\";" > $@
+#log the software path
+src/softwarepath.h:
+ echo "const char *executablePath = \"${PATH_SPACEFIX}\";" > $@
+
#generate object names
CPP_OBJ = $(CPP_SRC:.cpp=.o)
C_OBJ = $(C_SRC:.c=.o)
depend: .depend
-.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(TENS_LIB) src/gitcommit.h
+.depend: $(CPP_SRC) $(C_SRC) $(EXE_SRC) $(H5_LIB) $(TENS_DEPEND) src/gitcommit.h src/softwarepath.h
rm -f ./.depend
- $(CXX) $(CXXFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend;
+ $(CXX) $(CXXFLAGS) $(CPPFLAGS) -MM $(CPP_SRC) $(C_SRC) > ./.depend;
#compile each object
-.cpp.o: src/gitcommit.h
- $(CXX) -o $@ -c $(CXXFLAGS) -fPIC $<
+.cpp.o: src/gitcommit.h src/softwarepath.h
+ $(CXX) -o $@ -c $(CXXFLAGS) $(CPPFLAGS) -fPIC $<
.c.o:
- $(CC) -o $@ -c $(CFLAGS) $(H5_INCLUDE) -fPIC $<
+ $(CC) -o $@ -c $(CFLAGS) $(CPPFLAGS) $(H5_INCLUDE) -fPIC $<
#compile the main executable
-$(MAIN_EXECUTABLE): src/DNAscent.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(TENS_LIB) src/gitcommit.h
- $(CXX) -o $@ $(CXXFLAGS) -fPIC $(CPP_OBJ) $(C_OBJ) $(LIBFLAGS)
+$(MAIN_EXECUTABLE): src/DNAscent.o $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(TENS_DEPEND) src/gitcommit.h src/softwarepath.h
+ $(CXX) -o $@ $(CXXFLAGS) $(CPPFLAGS) -fPIC $(CPP_OBJ) $(C_OBJ) $(HTS_LIB) $(H5_LIB) $(TENS_LIB) $(LIBFLAGS) $(LDFLAGS)
clean:
- rm -f $(MAIN_EXECUTABLE) $(CPP_OBJ) $(C_OBJ) src/DNAscent.o gitcommit.h
+ rm -f $(MAIN_EXECUTABLE) $(CPP_OBJ) $(C_OBJ) src/DNAscent.o src/gitcommit.h src/softwarepath.h
diff --git a/README.md b/README.md
index d04cedb..95ec1aa 100755
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ git clone --recursive https://github.com/MBoemo/DNAscent.git
The DNAscent directory will appear in your current directory. Switch to the latest tagged version and compile the software by running:
```shell
cd DNAscent
-git checkout 2.0.2
+git checkout 3.0.2
make
```
This will put the DNAscent executable into the DNAscent/bin directory. A typical compile time for DNAscent and its dependencies is 5 minutes.
@@ -21,6 +21,7 @@ Please see the [documentation](https://dnascent.readthedocs.io) for detailed usa
## Citation
Please cite the following if you use DNAscent for your research:
+- Totanes FIG, Gockel J, Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. bioRxiv. [[bioRxiv](https://doi.org/10.1101/2022.07.27.501677)]
- Boemo, MA. DNAscent v2: Detecting replication forks in nanopore sequencing data with deep learning. *BMC Genomics* 2021;22:430. [[Journal Link](https://doi.org/10.1186/s12864-021-07736-6)]
- Muller CA, Boemo MA, Spingardi P, Kessler BM, Kriaucionis S, Simpson JT, Nieduszynski CA. Capturing the dynamics of genome replication on individual ultra-long nanopore sequence reads. *Nature Methods* 2019;16:429-436. [[Journal Link](https://www.nature.com/articles/s41592-019-0394-y)]
diff --git a/dnn_models/BrdU_detect.pb b/dnn_models/BrdU_detect.pb
deleted file mode 100644
index 1594a61..0000000
Binary files a/dnn_models/BrdU_detect.pb and /dev/null differ
diff --git a/dnn_models/detect_model_BrdUEdU/saved_model.pb b/dnn_models/detect_model_BrdUEdU/saved_model.pb
new file mode 100644
index 0000000..5976c64
Binary files /dev/null and b/dnn_models/detect_model_BrdUEdU/saved_model.pb differ
diff --git a/dnn_models/detect_model_BrdUEdU/variables/variables.data-00000-of-00001 b/dnn_models/detect_model_BrdUEdU/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000..a44a069
Binary files /dev/null and b/dnn_models/detect_model_BrdUEdU/variables/variables.data-00000-of-00001 differ
diff --git a/dnn_models/detect_model_BrdUEdU/variables/variables.index b/dnn_models/detect_model_BrdUEdU/variables/variables.index
new file mode 100644
index 0000000..08a6f91
Binary files /dev/null and b/dnn_models/detect_model_BrdUEdU/variables/variables.index differ
diff --git a/dnn_models/forkSense.pb b/dnn_models/forkSense.pb
deleted file mode 100644
index 1fc959a..0000000
Binary files a/dnn_models/forkSense.pb and /dev/null differ
diff --git a/docs/source/base.rst b/docs/source/base.rst
new file mode 100644
index 0000000..9f72094
--- /dev/null
+++ b/docs/source/base.rst
@@ -0,0 +1,46 @@
+.. DNAscent documentation master file, created by
+ sphinx-quickstart on Fri Feb 7 18:58:49 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+DNAscent
+====================================
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Contents:
+
+ installation
+ index
+ detect
+ forkSense
+ visualisation
+ workflows
+ cookbook
+ releaseNotes
+
+Overview
+--------
+
+DNAscent is software designed to detect the modified bases BrdU and EdU in Oxford Nanopore reads. In an experimental setup where BrdU and EdU are incorporated into nascent DNA by replication forks, this software can be used to answer questions that were traditionally answered by DNA fibre analysis.
+
+At present, the only Oxford Nanopore flow cells supported by DNAscent are R9.4.1. The Flongle, MinION, GridION, and PromethION platforms are all supported.
+
+DNAscent is under active development by the `Boemo Group `_ based in the `Department of Pathology, University of Cambridge `_. We aim to push regular updates and improvements, and incorporating new functionality is an active area of our computational research.
+
+
+Publications
+------------
+
+If you use DNAscent for your research, please cite the following publications:
+
+Totanes FIG, Gockel J, Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. [`bioRxiv `_]
+
+Boemo, MA DNAscent v2: Detecting replication forks in nanopore sequencing data with deep learning. BMC Genomics 2021;22:430. [`Journal DOI `_]
+
+Muller CA, Boemo MA, Spingardi P, Kessler, BM, Kriaucionis S, Simpson JT, Nieduszynski CA. Capturing the dynamics of genome replication on individual ultra-long nanopore sequence reads. Nature Methods 2019;16:429-436. [`Journal DOI `_]
+
+Bugs, Questions, and Comments
+-----------------------------
+
+Should any bugs arise or if you have any questions about usage, please raise a `GitHub issue `_. If you have comments or suggestions to improve the software or the documentation, please Email Michael Boemo at mb915@cam.ac.uk.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index fe98750..6949ed5 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,11 +18,11 @@
# -- Project information -----------------------------------------------------
project = 'DNAscent'
-copyright = '2020, Michael A. Boemo'
+copyright = '2022, Michael A. Boemo'
author = 'Michael A. Boemo'
# The full version, including alpha/beta/rc tags
-release = '2.0.2'
+release = '3.0.2'
# -- General configuration ---------------------------------------------------
@@ -41,7 +41,7 @@
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
-master_doc = 'index'
+master_doc = 'base'
# -- Options for HTML output -------------------------------------------------
@@ -53,4 +53,4 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = []
+html_static_path = ['_static']
diff --git a/docs/source/cookbook.rst b/docs/source/cookbook.rst
index f3dc53a..56e7702 100644
--- a/docs/source/cookbook.rst
+++ b/docs/source/cookbook.rst
@@ -29,73 +29,51 @@ The following barebones script parses the output of ``DNAscent detect``. We ite
strand = splitLine[4]
else:
posOnRef = int(splitLine[0])
- probBrdU = float(splitLine[1])
- sixMerOnRef = splitLine[2]
+ probEdU = float(splitLine[1])
+ probBrdU = float(splitLine[2])
+ sixMerOnRef = splitLine[3]
#add these values to a container or do some processing here
f.close()
-The following barebones script parses the output of ``DNAscent forkSense``. Note the similarity to the above script: All DNAscent output files were designed to have a very similar format to aid user processing.
-.. code-block:: python
-
- f = open('path/to/output.forkSense','r')
-
- for line in f:
-
- #ignore the header lines
- if line[0] == '#':
- continue
-
- #split the line into a list by whitespace
- splitLine = line.rstrip().split()
-
- if line[0] == '>':
-
- readID = splitLine[0][1:]
- chromosome = splitLine[1]
- refStart = int(splitLine[2])
- refEnd = int(splitLine[3])
- strand = splitLine[4]
- else:
- posOnRef = int(splitLine[0])
- probLeftFork = float(splitLine[1])
- probRightFork = float(splitLine[2])
-
- #add these values to a container or do some processing here
-
- f.close()
-
-And again for ``DNAscent regions``:
+The following example plots a histogram of fork track lengths from bed files generated by DNAscent forkSense.
.. code-block:: python
- f = open('path/to/output.regions','r')
+ import matplotlib
+ from matplotlib import pyplot as plt
- for line in f:
+ fnames = ['leftForks_DNAscent_forkSense.bed','rightForks_DNAscent_forkSense.bed']
- #ignore the header lines
- if line[0] == '#':
- continue
-
- #split the line into a list by whitespace
- splitLine = line.rstrip().split()
+ forkLengths = []
- if line[0] == '>':
+ for fn in fnames:
+ f = open(fn,'r')
- readID = splitLine[0][1:]
- chromosome = splitLine[1]
- refStart = int(splitLine[2])
- refEnd = int(splitLine[3])
- strand = splitLine[4]
- else:
- regionStart = int(splitLine[0])
- regionEnd = int(splitLine[1])
- regionScore = float(splitLine[2])
+ for line in f:
- #add these values to a container or do some processing here
+ #ignore the header lines
+ if line[0] == '#':
+ continue
+
+ #split the line into a list by whitespace
+ splitLine = line.rstrip().split()
+
+ lbound = int(splitLine[1])
+ rbound = int(splitLine[2])
+
+ forkLengths.append(rbound-lbound)
- f.close()
+ f.close()
+ plt.figure()
+ plt.hist(forkLengths)
+ plt.xlabel('Fork Track Length (bp)')
+ plt.ylabel('Count')
+ plt.savefig('forkTrackLen.pdf')
+ plt.close()
+
+
diff --git a/docs/source/detect.rst b/docs/source/detect.rst
index e77af6c..84d01ae 100644
--- a/docs/source/detect.rst
+++ b/docs/source/detect.rst
@@ -1,9 +1,9 @@
.. _detect:
-Detect
+detect
===============================
-``DNAscent detect`` is a ``DNAscent`` subprogram that goes through each read and, at each thymidine position, assigns the probability the thymidine is BrdU.
+``DNAscent detect`` is a ``DNAscent`` subprogram that analyses each nanopore-sequenced read and, at each thymidine position, assigns the probability that the base is really BrdU and EdU.
Usage
-----
@@ -21,15 +21,15 @@ Usage
-t,--threads number of threads (default is 1 thread),
--GPU use the GPU device indicated for prediction (default is CPU),
-q,--quality minimum mapping quality (default is 20),
- -l,--length minimum read length in bp (default is 100).
+ -l,--length minimum read length in bp (default is 1000).
The main input of ``DNAscent detect`` is an alignment (bam file) between the sequence fastq from Guppy and the organism's reference genome. This bam file should be sorted using ``samtools sort`` and indexed using ``samtools index`` so that there is a .bam.bai file in the same directory as the bam file. (Please see the example in :ref:`workflows` for details on how to do this.) The full path to the reference genome used in the alignment should be passed using the ``-r`` flag, and the index required by the ``-i`` flag is the file created using ``DNAscent index`` (see :ref:`index`).
-The number of threads is specified using the ``-t`` flag. ``DNAscent detect`` multithreads quite well by analysing a separate read on each thread, so multithreading is recommended. By default, the signal alignments and ResNet BrdU predictions are run on CPUs. If a CUDA-compatible GPU device is specified using the ``--GPU`` flag, then the signal alignments will be run on CPUs using the threads specified with ``-t`` and the ResNet BrdU prediction will be run on the GPU. Your GPU device number can be found with the command ``nvidia-smi``. GPU use requires that CUDA and cuDNN are set up correctly on your system and that these libraries can be accessed. If they're not, DNAscent will default back to using CPUs.
+The number of threads is specified using the ``-t`` flag. ``DNAscent detect`` multithreads quite well by analysing a separate read on each thread, so multithreading is recommended. By default, the signal alignments and ResNet predictions are run on CPUs. If a CUDA-compatible GPU device is specified using the ``--GPU`` flag, then the signal alignments will be run on CPUs using the threads specified with ``-t`` and the ResNet BrdU and EdU predictions will be run on the GPU. Your GPU device number can be found with the command ``nvidia-smi``. GPU use requires that CUDA and cuDNN are set up correctly on your system and that these libraries can be accessed. If they're not, DNAscent will default back to using CPUs.
It is sometimes useful to only run ``DNAscent detect`` on reads that exceed a certain mapping quality or length threshold (as measured by the subsequence of the contig that the read maps to). In order to do this without having to filter the bam file, DNAscent provides the ``-l`` and ``-q`` flags. Any read in the bam file with a reference length lower than the value specificed with ``-l`` or a mapping quality lower than the value specified with ``-q`` will be ignored.
-Before calling BrdU in a read, ``DNAscent detect`` must first perform a fast event alignment (see https://www.biorxiv.org/content/10.1101/130633v2 for more details). Quality control checks are performed on these alignments, and if they're not passed, then the read fails and is ignored. Hence, the number of reads in the output file will be slightly lower than the number of input reads. Typical failure rates are about 5-10%, although this will vary slightly depending on the read length, the BrdU substitution rate, and the genome sequenced.
+Before calling BrdU and EdU in a read, ``DNAscent detect`` must first perform a fast event alignment (see https://www.biorxiv.org/content/10.1101/130633v2 for more details). Quality control checks are performed on these alignments, and if they're not passed, then the read fails and is ignored. Hence, the number of reads in the output file will be slightly lower than the number of input reads. Typical failure rates are about 5-10%, although this will vary slightly depending on the read length, the analogue substitution rate, and the genome sequenced.
Output
------
@@ -46,11 +46,12 @@ Output
#Mode CNN
#MappingQuality 20
#MappingLength 5000
- #SignalDilation 1.000000
- #Version 2.0.0
+ #SystemStartTime 09/06/2022 12:45:29
+ #Software /path/to/DNAscent
+ #Version 3.0.0
#Commit 4cf80a7b89bdf510a91b54572f8f94d3daf9b167
-You can easily access the header of any .detect file with ``head -11 /path/to/output.detect`` or, alternatively, ``grep '#' /path/to/output.detect``.
+You can easily access the header of any .detect file with ``head -12 /path/to/output.detect`` or, alternatively, ``grep '#' /path/to/output.detect``.
Below the header is data for each read. Note that everything in this output file orients to the reference genome in the 5' --> 3' direction. Each read starts with a line in the format:
@@ -64,15 +65,16 @@ These lines always begin with a greater-than (>) character. Therefore, an easy
* the read mapped between ``mappingStart`` and ``mappingEnd`` on ``contig``,
* ``strand`` either takes the value ``fwd``, indicating that the read mapped to the forward strand, or ``rev`` indicating that the read mapped to the reverse complement strand.
-The following shows an example for a read that to the reverse strand between 239248 and 286543 on chrII.
+The following shows an example for a read that to the reverse strand between 48490 and 53033 on chromosome 1.
.. code-block:: console
- >c602f23f-e892-42ba-8140-da949abafbdd chrII 239248 286543 rev
+ >0d64a203-81b5-4b6c-aa2f-67b20969a509 1 48490 53033 rev
-Below these "start of read" lines, each line corresponds to the position of a thymidine in that read. There are three tab-separated columns:
+Below these "start of read" lines, each line corresponds to the position of a thymidine in that read. There are four tab-separated columns:
* the coordinate on the reference,
+* probability that the thymidine is actually EdU,
* probability that the thymidine is actually BrdU,
* 6mer on the reference.
@@ -81,32 +83,29 @@ Consider the following examples:
.. code-block:: console
- >c6785e1f-10d2-49cb-8ca3-e8d48979001b chrXIII 74003 81176 rev
- 74010 0.012874 TCTCTA
- 74011 0.012428 CTCTAA
- 74014 0.016811 TAACGA
- 74017 0.013372 CGACCA
- 74018 0.013836 GACCAA
+ >a4ea2872-9cb6-4218-afad-905f79204eb1 14 992440 996846 rev
+ 992448 0.125751 0.131483 ATCTTA
+ 992450 0.082488 0.078428 CTTATA
+ 992451 0.070718 0.050604 TTATAA
+ 992453 0.062216 0.047409 ATAACA
+ 992456 0.056369 0.042582 ACATTA
+ 992457 0.046755 0.038603 CATTAA
+ 992459 0.056535 0.041545 TTAATA
-Here, we're looking at the sequence TCTCTAACGACCAA on the reference genome. Because this read maps to the reverse complement, a call is made at every A (instead of T) on the reference. If instead we looked at a read that mapped to the forward strand, an example would be:
+
+Here, we're looking at the sequence ATCTTATAACATTAATA on the reference genome. Because this read maps to the reverse complement, a call is made at every A (instead of T) on the reference. The low probabilities of BrdU and EdU indicate the shown region of this particular molecule is unlikely to have analouge incorporated in it.
-.. code-block:: console
-
- >5d10eb9a-aae1-4db8-8ec6-7ebb34d32575 chrXIII 72319 77137 fwd
- 72319 0.017496 TCGTTT
- 72322 0.029483 TTTCTG
- 72323 0.039008 TTCTGT
- 72324 0.031474 TCTGTG
- 72326 0.026997 TGTGAG
-
-In both of these output snippets, we see from the second column that the probability of BrdU is low (around a 1-3% chance of BrdU) so these few bases are likely from a BrdU-negative region of DNA. In contrast, here we see the start of a read that does contain BrdU, and accordingly, the probability of BrdU at some positions is much higher:
+If instead we looked at a read that mapped to the forward strand, an example would be:
.. code-block:: console
- >a4f36092-b4d5-47a9-813e-c22c3b477a0c chrXVI 899273 907581 fwd
- 899276 0.866907 TCAAAT
- 899281 0.947935 TCCACA
- 899300 0.014683 TGGGAG
- 899312 0.186812 TAACGG
- 899320 0.934850 TTATTG
+ >d1e97c0f-5de7-4249-a426-30d5b4334106 2 748326 761207 fwd
+ 748327 0.066334 0.084699 TTTAGA
+ 748328 0.045942 0.152147 TTAGAA
+ 748329 0.040509 0.187831 TAGAAA
+ 748336 0.028645 0.278245 TCGGAC
+ 748352 0.021041 0.922350 TCGAAT
+ 748357 0.017188 0.932314 TGTAAT
+ 748359 0.016415 0.921368 TAATAT
+Here, we have a few genomic positions with high probability BrdU calls, indicating that the shown region of this molecule may be BrdU-substituted.
diff --git a/docs/source/forkSense.rst b/docs/source/forkSense.rst
index a328403..b059329 100644
--- a/docs/source/forkSense.rst
+++ b/docs/source/forkSense.rst
@@ -3,7 +3,7 @@
forkSense
===============================
-``DNAscent forkSense`` is a ``DNAscent`` subprogram that provides a probability estimate at each thymidine that a leftward- or rightward-moving fork moved through that position during the BrdU pulse.
+``DNAscent forkSense`` is a ``DNAscent`` subprogram that interprets the pattern of BrdU and EdU incorporation on each molecule, segmenting the read to show where leftward- or rightward-moving forks were moving during the BrdU and EdU pulses.
Usage
-----
@@ -11,82 +11,82 @@ Usage
.. code-block:: console
To run DNAscent forkSense, do:
- DNAscent forkSense -d /path/to/BrdUCalls.detect -o /path/to/output.forkSense
+ DNAscent forkSense -d /path/to/BrdUCalls.detect -o /path/to/output.forkSense --order EdU,BrdU
Required arguments are:
-d,--detect path to output file from DNAscent detect,
- -o,--output path to output file for forkSense.
+ -o,--output path to output file for forkSense,
+ --order order in which the analogues were pulsed (EdU,BrdU or BrdU,EdU).
Optional arguments are:
-t,--threads number of threads (default: 1 thread),
+ --markAnalogues writes analogue incorporation locations to a bed file (default: off),
--markOrigins writes replication origin locations to a bed file (default: off),
--markTerminations writes replication termination locations to a bed file (default: off),
--markForks writes replication fork locations to a bed file (default: off).
-The only required input of ``DNAscent forkSense`` is the output file produced by ``DNAscent detect``. Note that the detect file must have been produced using the v2.0 ResNet algorithm; ``DNAscent forkSense`` is not compatible with legacy HMM-based detection.
+The required inputs of ``DNAscent forkSense`` are the output file produced by ``DNAscent detect``, a new output file name for ``DNAscent forkSense`` to write on, and the order in which the analogues were pulsed. In the example command above, the ``--order`` flag indicates that EdU was pulsed first, and BrdU was pulsed second. The order of the pulses is important for determining fork direction and differentiating between origins and termination sites, but no information about the pulse length is needed. Note that the detect file must have been produced using the >v3.0.0 ResNet algorithm; ``DNAscent forkSense`` is not compatible with legacy HMM-based detection. Note further that >v3.0.0 ``DNAscent forkSense`` is not back compatible with the previous BrdU-only protocol, as it relies on the incorporation of both BrdU and EdU to determine fork direction. Users with data from a BrdU-only pulse-chase protocol should use DNAscent v2.0.2.
-If the ``--markOrigins`` flag is passed, ``DNAscent forkSense`` will use detected leftward- and rightward-moving forks to infer the locations of fired replication origins and write these to a bed file called ``origins_DNAscent_forkSense.bed`` in the working directory. Likewise, if the ``--markTerminations`` flag is passed, termination sites will be recorded in a bed file called ``terminations_DNAscent_forkSense.bed``.
Output
------
-If ``--markOrigins`` and/or ``--markTerminations`` were used, the resulting bed files has one called origin (for origins_DNAscent_forkSense.bed) or termination site (for terminations_DNAscent_forkSense.bed) per line and, in accordance with bed format, have the following space-separated columns:
+Main Output File
+^^^^^^^^^^^^^^^^
-* chromosome name,
-* 5' boundary of the origin (or terminiation site),
-* 3' boundary of the origin (or terminiation site),
-* read header of the read that the call came from (similar to those in the output file of ``DNAscent detect``).
+``DNAscent forkSense`` will produce a human-readable output file with the name and location that you specified using the ``-o`` flag. Like the output of ``DNAscent detect``, this file starts with a short header:
-Note that the "resolution" of the calls (i.e., the third column minus the second column) will depend on your experimental setup. In synchronised early S-phase cells, this difference for origin calls is likely to be small as the leftward- and rightward-moving forks from a fired origin are nearby one another. In asynchronous or mid/late S-phase cells, the difference is likely to be larger as the forks from a single origin will have travelled some distance before the BrdU pulse. The bed files only specify the region between matching leftward- and rightward-moving forks. Any subsequent assumptions (such as assuming uniform fork speed and placing the origin in the middle of that region) are left to the user.
+.. code-block:: console
-The output of ``DNAscent forkSense`` is a file with similar formatting to that of ``DNAscent detect``. The format for the read headers is the same. From left to right, the tab-delimited columns indicate:
+ #DetectFile /path/to/DNAscent.detect
+ #Threads 1
+ #Compute CPU
+ #SystemStartTime 10/06/2022 13:04:33
+ #Software /path/to/DNAscent
+ #Version 3.0.0
+ #Commit b9598a9e5bfa5f8314f92ba0f4fed39be1aee0be
+ #EstimatedRegionBrdU 0.559506
+ #EstimatedRegionEdU 0.202767
-* the coordinate on the reference,
-* probability that a leftward-moving fork passed through that coordinate during a BrdU pulse,
-* probability that a rightward-moving fork passed through that coordinate during a BrdU pulse.
+The fields in this header are analagous to the header from ``DNAscent detect``, but it includes two additional lines with an estimate of the thymidine-to-BrdU substitution rate in BrdU-positive regions and an estimate of the thymidine-to-EdU substitution rate in EdU-positive regions. In the example above, approximately 56% of thymidines are substituted for BrdU in BrdU-positive regions.
-A low probability in both the second and third columns suggests it was unlikely that a fork passed through that position during the pulse.
+The rest of this file has similar formatting to that of ``DNAscent detect``. The format for the read headers is the same. From left to right, the tab-delimited columns indicate:
-The following example output shows the end of a read that was passed through by a leftward-moving fork:
+* the coordinate on the reference,
+* a Boolean (0 or 1) indicating whether that position is in an EdU-positive region,
+* a Boolean (0 or 1) indicating whether that position is in an BrdU-positive region.
+
+The following example output shows an example:
.. code-block:: console
- >22c8a674-ed0e-475f-9c54-cb185299d923 chrII 173332 210452 fwd
- 173339 0.687217 0.062620
- 173341 0.687217 0.062620
- 173342 0.687217 0.062620
- 173345 0.687217 0.062620
- 173347 0.687217 0.062620
- 173348 0.687217 0.062620
- 173349 0.743986 0.045767
- 173358 0.743986 0.045767
- 173375 0.743986 0.045767
- 173377 0.743986 0.045767
- 173378 0.743986 0.045767
- 173381 0.743986 0.045767
- 173382 0.806924 0.038138
- 173383 0.806924 0.038138
- 173387 0.806924 0.038138
- 173390 0.806924 0.038138
- 173392 0.806924 0.038138
- 173393 0.806924 0.038138
- 173398 0.846875 0.032027
- 173402 0.846875 0.032027
- 173404 0.846875 0.032027
- 173406 0.846875 0.032027
- 173407 0.846875 0.032027
- 173417 0.846875 0.032027
- 173418 0.906748 0.028587
- 173419 0.906748 0.028587
- 173423 0.906748 0.028587
- 173425 0.906748 0.028587
- 173426 0.906748 0.028587
- 173428 0.906748 0.028587
- 173441 0.909755 0.029341
- 173445 0.909755 0.029341
- 173446 0.909755 0.029341
- 173449 0.909755 0.029341
- 173450 0.909755 0.029341
- 173451 0.909755 0.029341
- 173454 0.907803 0.029983
+ >806d1f69-1054-4b74-8356-d935a282a22e 11 1089865 1130164 fwd
+ 1089873 0 0
+ 1089874 0 0
+ 1089877 0 0
+ 1089878 0 0
+ 1089879 0 0
+ 1089880 0 0
+ 1089882 0 0
+ 1089895 0 0
+ 1089899 0 0
+
+Only reads that have at least one BrdU-positive or EdU-positive segment are written to this file. Reads with no base analogue segments called on them are omitted from this file, as 0's everywhere across these reads is implied. Note that the format of this file has changed substantially from DNAscent v2.*. This design decision stems from a shift in the algorithm used, as well as the desire to avoid using excess disk space with redundant information.
+
+
+Bed Files
+^^^^^^^^^
+
+If the ``--markOrigins`` flag is passed, ``DNAscent forkSense`` will write the genomic region between matched leftward- and rightward-moving forks to a bed file called ``origins_DNAscent_forkSense.bed`` in the working directory. Likewise, if the ``--markTerminations`` flag is passed, the genomic region between leftward- and rightward-moving forks moving towards each other will be recorded in a bed file called ``terminations_DNAscent_forkSense.bed``. The flag ``--markAnalogues`` will create two separate bed files: one containing the genomic location of BrdU-positive segments, and another containing the genomic location of EdU-positive segments.
+
+If the ``--markForks`` flag is passed, two bed files will be created in the working directory. The genomic location of leftward- and rightward-moving forks will be written to separate bed files called ``leftForks_DNAscent_forkSense.bed`` and ``rightForks_DNAscent_forkSense.bed``.
+
+All output bed files have the following space-separated columns:
+
+* chromosome name,
+* 5' boundary of the origin (or terminiation site, or fork),
+* 3' boundary of the origin (or terminiation site, or fork),
+* read header of the read that the call came from (similar to those in the output file of ``DNAscent detect``).
+For origins and termination sites, the "resolution" of the calls (i.e., the third column minus the second column) will depend on your experimental setup. In synchronised early S-phase cells, the genomic distance between the 5' and 3' boundaries likely to be small for origins and large for termination sites, as the leftward- and rightward-moving forks should be together near the origin. In asynchronous or mid/late S-phase cells, the origin calls may appear to be a "lower'' resolution (i.e., larger differences between the 5' and 3' boundaries) as the forks from a single origin will have travelled some distance before the pulses. When both forks are together at an origin, the origin bed file will record the midpoint of the analogue segment for the analogue that was pulsed first.
+The bed files created by ``DNAscent forkSense`` can be opened directly with a genome browser.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ee9f9d3..11d79fa 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,44 +1,27 @@
-.. DNAscent documentation master file, created by
- sphinx-quickstart on Fri Feb 7 18:58:49 2020.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
+.. _index:
-DNAscent
-====================================
+index
+===============================
-.. toctree::
- :maxdepth: 1
- :caption: Contents:
+``DNAscent index`` is a ``DNAscent`` subprogram that creates a map between Oxford Nanopore readIDs and fast5 files. This allows ``DNAscent detect`` to scan through bam files and pull out the relevant signal information for each read.
- installation
- index_subprog
- detect
- regions
- forkSense
- psl
- visualisation
- workflows
- cookbook
- releaseNotes
+Usage
+-----
-Overview
---------
+.. code-block:: console
-DNAscent is software designed to detect the modified base BrdU in Oxford Nanopore reads. In an experimental setup where BrdU is incorporated into nascent DNA by replication forks, this software can be used to answer questions that were traditionally answered by DNA fibre analysis.
+ To run DNAscent index, do:
+ DNAscent index -f /path/to/fast5Directory
+ Required arguments are:
+ -f,--files path to fast5 files,
+ -s,--sequencing-summary path to sequencing summary file Guppy.
+ Optional arguments are:
+ -o,--output output file name (default is index.dnascent),
+ --GridION account for the different sequencing summary format used by in-built GridION basecalling.
-DNAscent is under active development by the `Boemo Group `_ based in the `Department of Pathology, University of Cambridge `_. We aim to push regular updates and improvements, and incorporating new functionality is an active area of our computational research.
+The required inputs to ``DNAscent index`` are the full path to the top-level directory containing the sequencing run's fast5 files (passed using the ``-f`` flag) and the sequencing_summary.txt file from Guppy (located in the top-level Guppy output directory). Note that the sequencing_summary.txt file created by onboard GridION basecalling has a slightly different format from that of the summary file generated by Guppy, and this can be accounted for with the ``--GridION`` flag. The default behaviour of ``DNAscent index`` is to place a file called ``index.dnascent`` in the working directory. The name of this file can be overridden using the ``-o`` flag.
+Output
+-------
-Publications
-------------
-
-Please cite the following publication if you use DNAscent for your research:
-
-Boemo, MA. DNAscent v2: Detecting Replication Forks in Nanopore Sequencing Data with Deep Learning. BMC Genomics 2021;22:430. [`Journal DOI `_]
-
-Muller CA, Boemo MA, Spingardi P, Kessler, BM, Kriaucionis S, Simpson JT, Nieduszynski CA. Capturing the dynamics of genome replication on individual ultra-long nanopore sequence reads. Nature Methods 2019;16:429-436. [`Journal DOI `_]
-
-Bugs, Questions, and Comments
------------------------------
-
-Should any bugs arise or if you have any questions about usage, please raise a `GitHub issue `_. If you have comments or suggestions to improve the software or the documentation, please Email Michael Boemo at mb915@cam.ac.uk.
+``DNAscent index`` will put a file called ``index.dnascent`` in the current working directory (note that if you used the ``-o`` flag, then the file will have the name and location that you specified). This file will be needed as an input to ``DNAscent detect``.
diff --git a/docs/source/index_subprog.rst b/docs/source/index_subprog.rst
deleted file mode 100644
index 39213fe..0000000
--- a/docs/source/index_subprog.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-.. _index:
-
-Index
-===============================
-
-``DNAscent index`` is a ``DNAscent`` subprogram that creates a map between Oxford Nanopore readIDs and fast5 files. This allows ``DNAscent detect`` to scan through bam files and pull out the relevant signal information for each read.
-
-Usage
------
-
-.. code-block:: console
-
- To run DNAscent index, do:
- DNAscent index -f /path/to/fast5Directory
- Required arguments are:
- -f,--files path to fast5 files,
- -s,--sequencing-summary path to sequencing summary file Guppy.
- Optional arguments are:
- -o,--output output file name (default is index.dnascent),
- --GridION account for the different sequencing summary format used by in-built GridION basecalling.
-
-The first required input to ``DNAscent index`` is the full path to the top-level directory containing the sequencing run's fast5 files, passed using the ``-f`` flag. This will typically be the directory created with MinKNOW during sequencing. The second required input is the full path to the ``sequencing_summary.txt`` file, specified using the ``-s`` flag. This file is created by Guppy during basecalling and is located in the top level directory containing the Guppy-created fastq files. (Note that as of v2.0.3, providing the sequencing summary file is now required where it was optional in previous releases.) The default behaviour of ``DNAscent index`` is to place a file called ``index.dnascent`` in the working directory. The name of this file can be overridden using the ``-o`` flag. Note that the in-built version of Guppy on the GridION produces a sequencing summary file with a slightly different format than the version of Guppy available on the ONT Community website. Users can correct for this change of format from the GridION by adding the ``--GridION`` flag.
-
-Output
--------
-
-``DNAscent index`` will put a file called ``index.dnascent`` in the current working directory (note that if you used the ``-o`` flag, then the file will have the name and location that you specified). This file will be needed as an input to ``DNAscent detect``.
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 3ef6187..1694534 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -16,7 +16,7 @@ The DNAscent directory will appear in your current directory. Switch to the late
.. code-block:: console
cd DNAscent
- git checkout 2.0.2
+ git checkout 3.0.0
make
This will put the DNAscent executable into the DNAscent/bin directory. Compilation requires a version of gcc that supports C++14, and a typical compile time for DNAscent and all of its dependencies is 5-7 minutes.
@@ -32,35 +32,12 @@ Cloning the repository recursively (see above) will provide all the required dep
Please note that the high throughput sequencing library (htslib) requires bzlib and lzma for compression. While these are common on most systems, if you don't have these, apt-get lzma-dev, liblzma-dev, and libbz2-dev. In addition, pfasta requires libbsd on Linux.
-VBZ Fast5 Compression
----------------------
-
-In new versions of MinKNOW, the fast5 files are compressed with VBZ Compression (see https://github.com/nanoporetech/vbz_compression). To use DNAscent on these compressed fast5 files, do the following (N.B., we're assuming you don't have root permissions):
-
-#. Go to https://github.com/nanoporetech/vbz_compression/releases and download the plugin appropriate for your processor architecture. In this example, we'll use ``ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz``.
-
-#. Download and unpack the plugin:
-
- .. code-block:: console
-
- wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
- tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
-
-#. Add the plugin to your path:
-
- .. code-block:: console
-
- export HDF5_PLUGIN_PATH=/full/path/to/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin
-
-#. Run ``DNAscent detect`` as normal.
-
-
GPU Use
-------
-The ``DNAscent detect`` executable can make use of a GPU, although this is optional (see :ref:`detect`). DNAscent requires CUDA 10.0 and cuDNN 7.5, and information about these can be found at the following links:
+The ``DNAscent detect`` executable can make use of a GPU, although this is optional (see :ref:`detect`). DNAscent requires CUDA 11.0 and cuDNN 8.0, and information about these can be found at the following links:
* cuDNN: https://developer.nvidia.com/cudnn
-* CUDA: https://developer.nvidia.com/cuda-10.0-download-archive
+* CUDA: https://developer.nvidia.com/cuda-11.0-download-archive
Always discuss any installation or version changes with your system administrator.
diff --git a/docs/source/psl.rst b/docs/source/psl.rst
deleted file mode 100644
index fe93405..0000000
--- a/docs/source/psl.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-.. _psl:
-
-psl
-===============================
-
-``DNAscent psl`` is a ``DNAscent`` subprogram that writes a psl file to visualise the output of ``DNAscent detect``.
-
-Usage
------
-
-.. code-block:: console
-
- To run DNAscent psl, do:
- DNAscent psl -d /path/to/DNAscentOutput.detect -r /path/to/reference.fasta -o /path/to/psl_prefix
- Required arguments are:
- -d,--detect path to output file from DNAscent detect,
- -r,--reference path to genome reference in fasta format,
- -o,--output path to output bed prefix.
- Optional arguments are:
- --threshold probability above which a BrdU call is considered positive (default: 0.8),
- --min minimum read length to compute (default is 1),
- --max maximum read length to compute (default is Inf).
-
-The output file from ``DNAscent detect`` should be passed using the ``-d`` flag, and the reference genome used in the alignment should be passed with the ``-r`` flag.
-
-
-Output
-------
-
-The output is a psl file with each positive BrdU call marked as a tick. These files can then be opened in IGV or the UCSC Genome Browser to visualise positive BrdU calls genome-wide. Note that psl tracks are only plotted from the location of the first tick, so in order to visualise the portions of each read before the first BrdU call and after the last BrdU call, a placeholder tick is placed at the first and last coordinate of each read.
diff --git a/docs/source/regions.rst b/docs/source/regions.rst
deleted file mode 100644
index f1a09f8..0000000
--- a/docs/source/regions.rst
+++ /dev/null
@@ -1,57 +0,0 @@
-.. _regions:
-
-Regions
-===============================
-
-``DNAscent regions`` is a ``DNAscent`` subprogram that interprets the output of ``DNAscent detect`` to call regions of high and low BrdU incorporation.
-
-Note that as of v2.0, ``DNAscent regions`` has been largely superceded by ``DNAscent forkSense`` and the increased accuracy of ``DNAscent detect`` makes visualising BrdU incorporation in regions mostly unnecessary. However, it is still included to avoid breaking legacy workflows, and it does still have some uses as explained below.
-
-Usage
------
-
-.. code-block:: console
-
- To run DNAscent regions, do:
- DNAscent regions -d /path/to/DNAscentOutput.detect -o /path/to/DNAscentOutput.regions
- Required arguments are:
- -d,--detect path to output file from DNAscent detect,
- -o,--output path to output directory for bedgraph files.
- Optional arguments (if used with default ResNet-based detect) are:
- -r,--resolution number of thymidines in a region (default is 10).
- Optional arguments (if used with HMM-based detect) are:
- --threshold probability above which a BrdU call is considered positive (default: 0.8),
- -c,--cooldown minimum gap between positive analogue calls (default: 4),
- -r,--resolution minimum length of regions (default is 100 bp),
- -p,--probability override probability that a thymidine 6mer contains a BrdU (default: automatically calculated),
- -z,--zScore override zScore threshold for BrdU call (default: automatically calculated).
-
-The only required input of ``DNAscent regions`` is the output file produced by ``DNAscent detect``. ``DNAscent regions`` will first look through the detect file and determine the approximate fraction of thymidines replaced by BrdU in BrdU-positive regions. Using this probability, a z-score is assigned to each window (100 bp wide by default, but this can be changed using the ``-r`` flag) to indicate whether there is more or less BrdU than would be expected for an average BrdU-positive region. Naturally, some regions will be BrdU-positive but will have a substitution rate lower than average for BrdU-positive regions. Hence, ``DNAscent regions`` determines an appropriate boundary threshold between BrdU-positive regions and thymidine regions and rescales all of the z-scores so that this boundary is 0. ``DNAscent regions`` will calculate these values for you, but they can be overridden with the ``-p`` and ``-z`` flags, though this is generally not recommended. The exceptions are runs with 0% BrdU or runs where a high BrdU incorporation is expected along the entirety of each read. This is because these parameters are computed assuming that there are two populations (BrdU-positive and thymidine-only segments of DNA).
-
-In order to determine regions of high and low BrdU incorporation, ``DNAscent regions`` needs to count positive BrdU calls. By default, a thymidine is considered to be BrdU if it was scored with a probability higher than 0.8 by ``DNAscent detect``. This value was tuned in-house to optimise signal-to-noise, but it can be changed with the ``--threshold`` flag. Likewise, some care has to be given to how positive calls are counted, as BrdU can sometimes shift the signal of neighbouring thymidines. To prevent artefacts from overcounting while minimising undercounting, the default behaviour is to only make a positive call at most every 4 bases, though this can be changed with the ``-c`` flag.
-
-
-Output
-------
-
-The output of DNAscent regions is a file with similar formatting to that of ``DNAscent detect``. The format for the read headers is the same. From left to right, the tab-delimited columns indicate:
-
-* the start of the region,
-* the end of the region,
-* the z-score,
-* the string "BrdU" if the score is positive and "Thym" if the score is negative.
-
-A large positive z-score indicates high BrdU incorporation in that region, and a large negative score indicates very little BrdU incorporation in that region. An example output is as follows:
-
-.. code-block:: console
-
- >bfdc06e0-001f-41f7-bbea-f2f6785a3860 chrI 0 28066 fwd
- 62 167 -2.38086 Thym
- 173 276 -2.38086 Thym
- 283 388 -2.27466 Thym
- 393 499 -2.00741 Thym
- 501 605 -2.00741 Thym
- 606 708 -2.48397 Thym
- 713 817 -2.27466 Thym
-
-Note that the region width may sometimes vary slightly from the value specified. The region width is designated as the coordinate of the first thymidine greater than the window width (100 bp by default) from the starting coordinate. In order to guard against assigning a score to regions with very few thymidines, ``DNAscent regions`` will also extend the region until at least 10 calls are considered.
diff --git a/docs/source/releaseNotes.rst b/docs/source/releaseNotes.rst
index b986384..90e07b4 100644
--- a/docs/source/releaseNotes.rst
+++ b/docs/source/releaseNotes.rst
@@ -3,7 +3,18 @@
Release Notes
===============================
-v2.0.2
+v3.0.2
+-----------------
+
+* ``DNAscent detect`` now detects two different thymidine analogues, BrdU and EdU, in the same molecule,
+* ``DNAscent forkSense`` now uses the spatial patterning of EdU and BrdU to determine fork direction as in DNA fibre,
+* dnascent2bedgraph utility updated to plot both EdU and BrdU tracks in genome browsers,
+* ``DNAscent regions`` is now deprecated and has been fully superceded by ``DNAscent forkSense``,
+* ``DNAscent psl`` is now deprecated as reads can be more comprehensively plotted using the dnascent2bedgraph utility,
+* Migration from Tensorflow 1.14 to 2.4.1 and, correspondingly, GPU usage now requires CUDA 11 and cuDNN 8,
+* Released with `Totanes FIG, Gockel J, Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. bioRxiv `_.
+
+v2.0.0
-----------------
* Migration from HMM-based BrdU detection at every thymidine to ResNet-based detection at every thymidine,
@@ -11,9 +22,9 @@ v2.0.2
* Support for BrdU detection on GPUs,
* ``DNAscent forkSense`` to call replication origins and termination sites in both synchronously and asynchronously replicating cells at any point in S-phase,
* ``DNAscent align`` to align nanopore signals to reference,
-* Significant increases to replication origin calling accuracy and sensitivity,
+* Significant increases to replication origin calling accuracy,
* Visualisation utility for plotting output of multiple DNAscent executables as bedgraphs,
-* Released with `Boemo, MA. DNAscent v2: Detecting Replication Forks in Nanopore Sequencing Data with Deep Learning. bioRxiv 2020 `_.
+* Released with `Boemo, MA. DNAscent v2: Detecting replication forks in nanopore sequencing data with deep learning. BMC Genomics 2021;22:430 `_.
v1.0.0
-----------------
@@ -22,7 +33,7 @@ v1.0.0
* Improvements to BrdU detection accuracy,
* ``DNAscent train`` to train Guassian mixture models from nanopolish eventalign.
-v0.1.0
+v0.1
-----------------
* HMM-based BrdU detection at ~160 thymidine-containing 6mers,
diff --git a/docs/source/visualisation.rst b/docs/source/visualisation.rst
index 60e8bc0..216dfea 100644
--- a/docs/source/visualisation.rst
+++ b/docs/source/visualisation.rst
@@ -3,28 +3,28 @@
Visualisation
===============================
-DNAscent supports multilevel analysis: We want users to be able to see the fork calls made by ``DNAscent forkSense`` and visualise them alongside individual base-pair resolution BrdU calls by ``DNAscent detect`` in order to see why these calls are being made. To that end, we include a visualisation utility in ``DNAscent/utils`` that formats the output of DNAscent executables (detect, regions, and forkSense) into bedgraphs that can be visualised with IGV or the UCSC Genome Browser. You can supply this utility with the output from one, two, or all three of these executables. If more than one is specified, the utility organises the bedgraphs so that the tracks for each read are grouped together.
+DNAscent supports multilevel analysis: We want users to be able to see the fork calls made by ``DNAscent forkSense`` and visualise them alongside the individual base-pair resolution BrdU and EdU calls by ``DNAscent detect`` in order to see why these calls are being made. To that end, we include a visualisation utility in ``DNAscent/utils`` that formats the output of DNAscent executables (detect and forkSense) into bedgraphs that can be visualised with IGV or the UCSC Genome Browser. You can supply this utility with the output from one or two of these executables. If more than one is specified, the utility organises the bedgraphs so that the tracks for each read are grouped together.
Usage
-----
.. code-block:: console
- dnascent2bedgraph.py: Converts the output of DNAscent detect, regions, and forkSense into bedgraphs.
+ dnascent2bedgraph.py: Converts the output of DNAscent detect and forkSense into bedgraphs.
To run dnascent2bedgraph.py, do:
python dnascent2bedgraph.py [arguments]
Example:
python dnascent2bedgraph.py -d /path/to/dnascentDetect.out -f /path/to/dnascentForksense.out -o /path/to/newBedgraphDir -n 1000 --minLength 10000
Required arguments are at least one of the following:
-d,--detect path to DNAscent detect output file,
- -f,--forkSense path to DNAscent forkSense output file,
- -r,--regions path to DNAscent regions output file.
+ -f,--forkSense path to DNAscent forkSense output file.
Required argument is:
-o,--output output directory which will be created.
Optional arguments are:
--minLength only convert reads with specified minimum read length (in base pairs) into bedgraphs (default: 1),
--maxLength only convert reads with specified maximum read length (in base pairs) into bedgraphs (default: Inf),
-n,--maxReads maximum number of reads to convert into bedgraphs (default: Inf),
+ --targets forkSense bed file with specific reads to plot,
--filesPerDir maximum reads per subdirectory (default: 300).
A further example of how to use ``dnascent2bedgraph`` is given in :ref:`workflows`.
@@ -32,4 +32,4 @@ A further example of how to use ``dnascent2bedgraph`` is given in :ref:`workflow
Output
------
-``dnascent2bedgraph`` will create the directory you specified using the ``-o`` flag which will contain integer-numbered subdirectories. Each of these subdirectories will contain the bedgraphs for the number of reads specified by ``--filesPerDir`` (default is 300). If the output of more than one DNAscent executable was specified using the ``-d``, ``-f``, and ``-r`` flags, then the bedgraphs for each read will be grouped together so that they appear in IGV as consecutive tracks.
+``dnascent2bedgraph`` will create the directory you specified using the ``-o`` flag which will contain integer-numbered subdirectories. Each of these subdirectories will contain the bedgraphs for the number of reads specified by ``--filesPerDir`` (default is 300). If the output of more than one DNAscent executable was specified using the ``-d`` and ``-f`` flags, then the bedgraphs for each read will be grouped together so that they appear in IGV as consecutive tracks. Rather than plotting bedgraphs for every read in the sequencing run, it is sometimes useful to plot bedgraphs of just those reads that have a replication feature (e.g., an origin) called on them. When a bed file of origin calls from DNAscent forkSense is passed using the ``--targets`` flag, dnascent2bedgraph will only write bedgraphs of the reads in the bed file.
diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst
index 73375e1..38b9570 100644
--- a/docs/source/workflows.rst
+++ b/docs/source/workflows.rst
@@ -6,7 +6,7 @@ Workflow
The following is a full DNAscent workflow, where we'll start off after Guppy has finished running (users that need help with Guppy should refer to the `Oxford Nanopore webpages `_). In particular, we assume the following:
* you have a directory of 1D R9.5 or R9.4.1 450bp/s Oxford Nanopore fast5 reads (which may be in subdirectories) that you want to use for detection,
-* these reads have been basecalled to fastq format using Albacore or Guppy (available from Oxford Nanopore),
+* these reads have been basecalled to fastq format using Guppy (available from Oxford Nanopore),
* you have a reference/genome file (in fasta format) for your reads.
Example Workflow
@@ -18,21 +18,22 @@ Download and compile DNAscent:
git clone --recursive https://github.com/MBoemo/DNAscent.git
cd DNAscent
- git checkout 2.0.2
+ git checkout 3.0.2
make
- cd ..
Concatenate the fastq files from Guppy:
.. code-block:: console
- cat /path/to/GuppyOutDirectory/*.fastq > reads.fastq
+ cat /path/to/GuppyOutDirectory/pass/*.fastq /path/to/GuppyOutDirectory/fail/*.fastq > reads.fastq
+
+Note that we recommend running DNAscent on reads that have passed and failed Guppy's QCs, hence concatenating them into a single fastq file above. Analogue-substituted reads (particularly if they are heavily substituted) are predisposed to failing Guppy's QCs, so only running DNAscent on Guppy's passed reads can disproportionately throw out the reads you are most interested in. DNAscent will do its own analogue-aware QCs at the ``DNAscent detect`` stage.
Align the reads with `minimap2 `_ and sort with `samtools `_:
.. code-block:: console
- minimap2 -ax map-ont -o alignment.sam /path/to/reference.fasta reads.fastq
+ minimap2 -L -ax map-ont -o alignment.sam /path/to/reference.fasta reads.fastq
samtools view -Sb -o alignment.bam alignment.sam
samtools sort alignment.bam alignment.sorted
samtools index alignment.sorted.bam
@@ -55,6 +56,7 @@ Alternatively, if the system has a CUDA-compatible GPU in it, we can run ``nvidi
.. code-block:: console
+ Thu Aug 20 21:06:57 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
@@ -83,29 +85,38 @@ From this, we can see that the GPU's device ID is 0 (just to the left of Tesla)
Note that we're assuming the CUDA libraries for the GPU have been set up properly (see :ref:`installation`). If these libraries can't be accessed, DNAscent will splash a warning saying so and default back to using CPUs.
-When ``DNAscent detect`` is finished, it will should put a file called ``output.detect`` in the current directory. We can look at the individual positive BrdU calls with ``DNAscent psl``. Let's create a psl file that shows any position where BrdU is called at 0.7 probability or higher:
+When ``DNAscent detect`` is finished, there will be a file called ``output.detect`` in the current directory. At this point, we can make bedgraphs out of the ``DNAscent detect`` output (see :ref:`visualisation`) which can also be loaded into IGV or the UCSC Genome Browser.
+
+Lastly, we can run ``DNAscent forkSense`` on the output of ``DNAscent detect`` to measure replication fork movement. Suppose that in our experimental protocol, we pulsed BrdU first followed by EdU. Let's run it on four threads and specify that we want it to keep track of replication origins, forks, and termination sites:
.. code-block:: console
- DNAscent psl -d output.detect -r /full/path/to/reference.fasta -o output --threshold 0.7
+ DNAscent forkSense -d output.detect -o output.forkSense -t 4 --markOrigins --markTerminations --markForks --order BrdU,EdU
+
+This will make the following files:
-The resulting file ``output.psl`` can be loaded into IGV or the UCSC Genome Browser. At this point, we can make bedgraphs out of the ``DNAscent detect`` output (see :ref:`visualisation`) which can also be loaded into IGV or the UCSC Genome Browser.
+* origins_DNAscent_forkSense.bed (with our origin calls),
+* terminations_DNAscent_forkSense.bed (with our termination calls),
+* two bed files (leftForks_DNAscent_forkSense.bed, rightForks_DNAscent_forkSense.bed) with our fork calls,
+* output.forkSense.
-Lastly, we can run ``DNAscent forkSense`` on the output of ``DNAscent detect`` to measure replication fork movement. Let's run it on four threads and specify that we want it to keep track of replication origins, forks, and termination sites:
+We can load the bed files directly into IGV to see where origins, forks, and terminiations were called in the genome.
+
+We can visualise (see :ref:`visualisation`) output.forkSense by turning them into bedgraphs:
.. code-block:: console
- DNAscent forkSense -d output.detect -o output.forkSense -t 4 --markOrigins --markTerminations --markForks
+ python dnascent2bedgraph.py -d output.detect -f output.forkSense -o newBedgraphDirectory
-This will make three files: origins_DNAscent_forkSense.bed (with our origin calls), terminations_DNAscent_forkSense.bed (with our termination calls), and output.forkSense. We can load the two bed files directly into IGV to see where origins and terminiations were called in the genome.
+This will create a new directory called ``newBedgraphDirectory``. By passing both a ``forkSense`` and ``detect`` file to dnascent2bedgraph.py, the utility will convert them both into bedgraphs and organise them so that for each read, we can see the single-nt BrdU and EdU detection output from ``DNAscent detect`` right next to the left- and rightward-moving fork probabilities from ``DNAscent forkSense``. These bedgraphs can then be loaded into IGV or the UCSC Genome Browser.
-We can visualise (see :ref:`visualisation`) the first 1500 reads of output.forkSense by turning them into bedgraphs:
+Perhaps, however, we are only interested in viewing reads with origin calls on them. In this case, we can use the bed file generated above (origins_DNAscent_forkSense.bed) to specify that we only want bedgraphs of reads with origin calls on them.
.. code-block:: console
- python dnascent2bedgraph.py -d output.detect -f output.forkSense -o newBedgraphDirectory -n 1500
-
-This will create a new directory called ``newBedgraphDirectory``. By passing both a ``forkSense`` and ``detect`` file to dnascent2bedgraph.py, the utility will convert them both into bedgraphs and organise them so that for each read, we can see the bp-resolution BrdU detection output from ``DNAscent detect`` right next to the left- and rightward-moving fork probabilities from ``DNAscent forkSense``. These bedgraphs can then be loaded into IGV or the UCSC Genome Browser.
+ python dnascent2bedgraph.py -d output.detect -f output.forkSense -o newBedgraphDirectory --targets origins_DNAscent_forkSense.bed
+
+This strategy works equally well for any of the bed files generated by DNAscent forkSense.
Barcoding
---------
diff --git a/src/DNAscent.cpp b/src/DNAscent.cpp
index 370bd51..a31bb1f 100755
--- a/src/DNAscent.cpp
+++ b/src/DNAscent.cpp
@@ -1,5 +1,5 @@
//----------------------------------------------------------
-// Copyright 2019-2020 University of Oxford
+// Copyright 2019 University of Oxford
// Written by Michael A. Boemo (mb915@cam.ac.uk)
// This software is licensed under GPL-3.0. You should have
// received a copy of the license with this software. If
@@ -11,8 +11,6 @@
#include