stage 4.0.2

MBoemo · Jul 23, 2024 · 884bef1 · 884bef1
1 parent 286d5ff
commit 884bef1
Show file tree

Hide file tree

Showing 17 changed files with 123 additions and 198 deletions.
diff --git a/DNAscent.def b/DNAscent.def
@@ -0,0 +1,54 @@
+Bootstrap: docker
+From: nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04
+
+%labels
+    Version v4.0.2
+
+%help
+    DNAscent is software for detecting regions of BrdU and EdU incorporation in Oxford Nanopore reads. 
+    Source: https://github.com/MBoemo/DNAscent
+    Documentation: https://dnascent.readthedocs.io/en/latest/?badge=latest
+    Web: https://www.boemogroup.org/
+    Please submit any bugs to https://github.com/MBoemo/DNAscent/issues.
+
+%post
+
+    # Install system packages
+    apt-get update && apt-get install -y \
+        lzma-dev \
+        liblzma-dev \
+        libbz2-dev \
+        libbsd-dev \
+        git \
+        build-essential \
+        wget
+
+    # Clone and compile DNAscent
+    mkdir -p /app
+    cd app
+    git clone --recursive https://github.com/MBoemo/DNAscent.git
+    cd DNAscent
+    make
+
+    # Install vbz plugin
+    cd /
+    mkdir -p /plugin
+    cd plugin
+    wget https://github.com/nanoporetech/vbz_compression/releases/download/v1.0.1/ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+    tar -xf ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz
+
+%environment
+
+    # cuda paths
+    export CUDA_HOME=/usr/local/cuda
+    export CPATH=/usr/local/cuda/include:$CPATH
+    export CUDA_PATH=/usr/local/cuda
+    export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+
+    # vbz plugin path
+    export HDF5_PLUGIN_PATH=/plugin/ont-vbz-hdf-plugin-1.0.1-Linux/usr/local/hdf5/lib/plugin
+
+%runscript
+    exec /app/DNAscent/bin/DNAscent "$@"
diff --git a/Makefile b/Makefile
@@ -60,8 +60,8 @@ tensorflow/include/tensorflow/c/c_api.h:
 	if [ ! -e tensorflow/include/tensorflow/c/c_api.h ]; then \
 		mkdir tensorflow; \
 		cd tensorflow; \
-		wget https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-2.12.0.tar.gz; \
-		tar -xzf libtensorflow-gpu-linux-x86_64-2.12.0.tar.gz || exit 255; \
+		wget https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-2.4.1.tar.gz; \
+		tar -xzf libtensorflow-gpu-linux-x86_64-2.4.1.tar.gz || exit 255; \
 		cd ..; \
 	fi 
 

diff --git a/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/fingerprint.pb b/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/fingerprint.pb
diff --git a/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/keras_metadata.pb b/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/keras_metadata.pb
diff --git a/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/saved_model.pb b/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/saved_model.pb
diff --git a/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/variables/variables.data-00000-of-00001 b/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/variables/variables.data-00000-of-00001
diff --git a/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/variables/variables.index b/dnn_models/detect_model_BrdUEdU_DNAr10_4_1/variables/variables.index
diff --git a/docs/source/detect.rst b/docs/source/detect.rst
@@ -48,7 +48,7 @@ Output
    #MappingLength 5000
    #SystemStartTime 09/02/2024 12:45:29
    #Software /path/to/DNAscent
-   #Version 4.0.1
+   #Version 4.0.2
    #Commit 4cf80a7b89bdf510a91b54572f8f94d3daf9b167
 
 You can easily access the header of any .detect file with ``head -11 /path/to/output.detect`` or, alternatively, ``grep '#' /path/to/output.detect``.

diff --git a/docs/source/forkSense.rst b/docs/source/forkSense.rst
@@ -44,7 +44,7 @@ Main Output File
    #Compute CPU
    #SystemStartTime 10/02/2024 13:04:33
    #Software /path/to/DNAscent
-   #Version 4.0.1
+   #Version 4.0.2
    #Commit b9598a9e5bfa5f8314f92ba0f4fed39be1aee0be
    #EstimatedRegionBrdU 0.559506
    #EstimatedRegionEdU 0.202767

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -25,7 +25,7 @@ Overview
 
 DNAscent is software designed to detect the base analogues BrdU and EdU in Oxford Nanopore reads.  In an experimental setup where BrdU and EdU are incorporated into nascent DNA by replication forks, this software can be used to answer questions that were traditionally answered by DNA fibre analysis.  DNAscent can also call the genomic positions of stalled and stressed replication forks for use as a replication stress assay.
 
-DNAscent v4.0.1 supports sequencing data collected on Oxford Nanopore R10.4.1 flow cells. Users wishing to analyse data acquired on legacy R9.4.1 flow cells should roll back to DNAscent v3.1.2 as v4.0.1 is not back-compatible with R9.4.1 flow cells. As R9.4.1 flow cells have been deprecated by Oxford Nanopore,
+DNAscent v4.0.2 supports sequencing data collected on Oxford Nanopore R10.4.1 flow cells. Users wishing to analyse data acquired on legacy R9.4.1 flow cells should roll back to DNAscent v3.1.2 as v4.0.2 is not back-compatible with R9.4.1 flow cells. As R9.4.1 flow cells have been deprecated by Oxford Nanopore,
 previous versions of DNAscent designed for R9.4.1 flow cells (v3.1.2 and below) are no longer under active development.
 
 The Oxford Nanopore Flongle, MinION, GridION, and PromethION platforms are all supported.

diff --git a/docs/source/index_exe.rst b/docs/source/index_exe.rst
@@ -3,7 +3,7 @@
 index
 ===============================
 
-``DNAscent index`` is a ``DNAscent`` subprogram that creates a map between Oxford Nanopore readIDs and fast5 files.  This allows ``DNAscent detect`` to scan through bam files and pull out the relevant signal information for each read.
+``DNAscent index`` is a ``DNAscent`` subprogram that creates a map between Oxford Nanopore readIDs and FAST5 files.  This allows ``DNAscent detect`` to scan through bam files and pull out the relevant signal information for each read.
 
 Usage
 -----
@@ -16,10 +16,9 @@ Usage
      -f,--files                path to fast5 files.
      -s,--sequencing-summary   path to sequencing summary file Guppy.
    Optional arguments are:
-     -o,--output               output file name (default is index.dnascent),
-        --GridION              account for the different sequencing summary format used by in-built GridION basecalling.
+     -o,--output               output file name (default is index.dnascent).
 
-The required inputs to ``DNAscent index`` are the full path to the top-level directory containing the sequencing run's fast5 files (passed using the ``-f`` flag) and the path to the ``sequencing_summary.txt`` file (specified using the ``-s`` flag).  
+The required inputs to ``DNAscent index`` are the full path to the top-level directory containing the sequencing run's FAST5 files (passed using the ``-f`` flag) and the path to the ``sequencing_summary.txt`` file (specified using the ``-s`` flag).  
 ``sequencing_summary.txt`` is created by Guppy during basecalling and is located in the top level directory containing the Guppy-created fastq files.  
 The default behaviour of ``DNAscent index`` is to place a file called ``index.dnascent`` in the working directory.  The name of this file can be overridden using the ``-o`` flag.
 

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -1,8 +1,28 @@
 .. _installation:
 
-Download & Installation
+Getting Started
 ===============================
 
+
+Singularity
+---------------------
+
+We recommend running DNAscent using one of our supported Singularity images. These images contain all necessary dependencies, including TensorFlow, CUDA, cuDNN, and compression plugins, so that your system only needs a valid NVIDIA driver for GPU usage. If your system does not have Singularity installed, instructions are available `here <https://docs.sylabs.io/guides/3.0/user-guide/installation.html>`_.
+
+.. code-block:: console
+
+   singularity pull DNAscent.sif library://mboemo/DNAscent/DNAscent:4.0.2
+   
+You can run DNAscent from the image by passing the desired executable and arguments. The following example shows how to run DNAscent :ref:`detect`:
+
+.. code-block:: console
+
+   singularity run --nv DNAscent.sif detect -b /path/to/alignment.bam -r /path/to/reference.fasta -i /path/to/index.dnascent -o /path/to/output.detect
+
+
+Building from Source
+---------------------
+
 Clone the DNAscent repository with the recursive flag so that the dependencies are cloned as well.
 
 .. code-block:: console
@@ -14,7 +34,7 @@ The DNAscent directory will appear in your current directory. Switch to the late
 .. code-block:: console
 
    cd DNAscent
-   git checkout 4.0.1
+   git checkout 4.0.2
    make
 
 This will put the DNAscent executable into the DNAscent/bin directory. Compilation requires a version of gcc that supports C++14, and a typical compile time for DNAscent and all of its dependencies is 5-7 minutes.
@@ -28,12 +48,9 @@ Cloning the repository recursively (see above) will provide all the required dep
 * tinydir (https://github.com/cxong/tinydir.git)
 * TensorFlow (https://www.tensorflow.org/install/lang_c)
 
-Please note that the high throughput sequencing library (htslib) requires bzlib and lzma for compression. While these are common on most systems, if you don't have these, apt-get lzma-dev, liblzma-dev, and libbz2-dev. In addition, pfasta requires libbsd on Linux.
-
-VBZ Fast5 Compression
----------------------
+Please note that the high throughput sequencing library (htslib) requires bzlib and lzma for compression. While these are common on most systems, if you don't have them, install lzma-dev, liblzma-dev, and libbz2-dev (e.g. with ``apt-get install``). In addition, pfasta requires libbsd.
 
-In new versions of MinKNOW, the fast5 files are compressed with VBZ Compression (see https://github.com/nanoporetech/vbz_compression).  To use DNAscent on these compressed fast5 files, do the following (N.B., we're assuming you don't have root permissions):
+FAST5 files are compressed with VBZ Compression (see https://github.com/nanoporetech/vbz_compression).  To use DNAscent on these compressed FAST5 files, do the following (N.B., we're assuming you don't have root permissions):
 
 #. Go to https://github.com/nanoporetech/vbz_compression/releases and download the plugin appropriate for your processor architecture.  In this example, we'll use ``ont-vbz-hdf-plugin-1.0.1-Linux-x86_64.tar.gz``.
 
@@ -52,10 +69,7 @@ In new versions of MinKNOW, the fast5 files are compressed with VBZ Compression
 
 #. Run ``DNAscent detect`` as normal.
 
-GPU Use
--------
-
-The ``DNAscent detect`` executable can make use of a GPU, although this is optional (see :ref:`detect`).  DNAscent requires CUDA 11.8 and cuDNN 8.9. Information about these can be found at the following links:
+The ``DNAscent detect`` executable can make use of a GPU, although this is optional (see :ref:`detect`).  DNAscent requires CUDA 11.1 and cuDNN 8.0. Information about these can be found at the following links:
 
 * cuDNN: https://developer.nvidia.com/cudnn
 * CUDA: https://developer.nvidia.com/cuda-11.0-download-archive

diff --git a/docs/source/releaseNotes.rst b/docs/source/releaseNotes.rst
@@ -3,6 +3,16 @@
 Release Notes
 ===============================
 
+v4.0.2
+-----------------
+
+* The deep learning model is the same as in v4.0.1. Users should see nearly identical performance in BrdU and EdU calling.
+* Fork speed and stall calling have been fine-tuned since v4.0.1. Users should now see a closer match between the results of v4.0.2 on R10 pores and those of v3.1.2 on R9 pores.
+* Starting with this release, we will be providing supported Singularity images here (https://cloud.sylabs.io/library/mboemo). This is now the recommended way to use and run DNAscent.
+* There was a longstanding issue of different basecallers producing slightly different sequencing summary formats which could trip up ``DNAscent index``. Previously, we provided various flags for ``DNAscent index`` to account for the different formats but this was more complicated than it needed to be. ``DNAscent index`` will now tune itself for whatever sequencing summary format you have.
+* v4.0.1 had an issue where one of the model layers was not properly optimised for GPU usage by TensorFlow 2.12.0. This was causing slow runtimes of ``DNAscent detect`` on certain GPUs. The issue is fixed in this release, although it required a rollback to TensorFlow 2.4.1 (at least for now). Part of our aim in releasing and supporting Singularity images was to mitigate any inconvenience that might have been caused by the need to change CUDA and cuDNN versions.
+* Work on POD5 and Dorado support is still ongoing. While we missed our target for including them in this release, positive community feedback on v4.0.1 and deprecation of R9 flow cells meant that we wanted to move R10 support out of the pre-release stage and ship a fully supported release to address the above issues. Engineering on quality-of-life features remains ongoing.
+
 v4.0.1
 -----------------
 
@@ -12,7 +22,7 @@ This is a pre-release. It is possible (maybe even probable) that the deep learni
 * POD5 is not yet supported but POD5 support is planned for v4.0.2 which will be the LTS release. POD5 can be converted to FAST5 using the ``pod5 convert to_fast5`` utility (https://pypi.org/project/pod5/#pod5-convert-to_fast5).
 * DNAscent's deep learning models were trained on R10.4.1 flow cells with 5kHz sampling which is now the default sampling rate on Oxford Nanopore platforms. Using DNAscent with reads sequenced with the older 4kHz sampling rate is not recommended.
 * ``DNAscent index`` still uses the sequencing_summary.txt file from Guppy and legacy versions of Guppy are available on the Oxford Nanopore Community webpage. Compatibility with Dorado is planned for v4.0.2 LTS.
-* Tensorflow updated to 2.12.0 and, correspondingly, GPU usage now requires CUDA 11.8 and cuDNN 8.9.
+* TensorFlow updated to 2.12.0 and, correspondingly, GPU usage now requires CUDA 11.8 and cuDNN 8.9.
 * Usage is otherwise identical to v3.1.2.
 * Training data for this release was provided by Mathew Jones at the University of Queensland. This software was developed in collaboration with the `Jones Lab <https://researchers.uq.edu.au/researcher/25051>`_, `Merrick Lab <https://www.path.cam.ac.uk/directory/catherine-merrick>`_, and `McClelland Lab <https://www.bartscancer.london/staff/professor-sarah-mcclelland/>`_ to whom we are grateful for their collaboration and support. We are particularly grateful to them for supporting the release of the software to the community ahead of publication.
 
@@ -33,7 +43,7 @@ v3.0.2
 * dnascent2bedgraph utility updated to plot both EdU and BrdU tracks in genome browsers,
 * ``DNAscent regions`` is now deprecated and has been fully superseded by ``DNAscent forkSense``,
 * ``DNAscent psl`` is now deprecated as reads can be more comprehensively plotted using the dnascent2bedgraph utility,
-* Migration from Tensorflow 1.14 to 2.4.1 and, correspondingly, GPU usage now requires CUDA 11 and cuDNN 8,
+* Migration from TensorFlow 1.14 to 2.4.1 and, correspondingly, GPU usage now requires CUDA 11 and cuDNN 8,
 * Released with `Totanes FIG,  Gockel J,  Chapman SE, Bartfai R, Boemo MA, Merrick CJ. Replication origin mapping in the malaria parasite Plasmodium falciparum. bioRxiv <https://doi.org/10.1101/2022.07.27.501677>`_.
 
 v2.0.0

diff --git a/docs/source/workflows.rst b/docs/source/workflows.rst
@@ -3,12 +3,12 @@
 Workflow
 ===============================
 
-The following is a full DNAscent workflow, where we'll start off after Guppy has finished running (users that need help with Guppy should refer to the `Oxford Nanopore webpages <https://nanoporetech.com/nanopore-sequencing-data-analysis>`_). The recommended Guppy basecalling configuration file for v4.0.1 is ``dna_r10.4.1_e8.2_400bps_5khz_hac.cfg``.
+The following is a full DNAscent workflow, where we'll start off after Guppy has finished running (users that need help with Guppy should refer to the `Oxford Nanopore webpages <https://nanoporetech.com/nanopore-sequencing-data-analysis>`_). The recommended Guppy basecalling configuration file for v4.0.2 is ``dna_r10.4.1_e8.2_400bps_5khz_hac.cfg``.
 In particular, we assume the following:
 
-* you have a directory of R10.4.1 Oxford Nanopore fast5 reads (which may be in subdirectories) that you want to use for detection,
-* these reads have been basecalled to fastq format using Guppy (available from Oxford Nanopore),
-* you have a reference/genome file (in fasta format) for your reads.
+* You have a directory of R10.4.1 Oxford Nanopore FAST5 reads (which may be in subdirectories) that you want to use for detection. Currently, DNAscent only supports FAST5 format. You can convert POD5 to FAST5 using the ``pod5 convert to_fast5`` utility (https://pypi.org/project/pod5/#pod5-convert-to_fast5).
+* Reads have been basecalled to fastq format using Guppy (available from Oxford Nanopore).
+* You have a reference/genome file (in fasta format) for your reads.
 
 Example Workflow
 ----------------
@@ -19,7 +19,7 @@ Download and compile DNAscent:
 
    git clone --recursive https://github.com/MBoemo/DNAscent.git
    cd DNAscent
-   git checkout 4.0.1
+   git checkout 4.0.2
    make
    cd ..
 

diff --git a/src/common.h b/src/common.h
@@ -9,7 +9,7 @@
 #ifndef COMMON_H
 #define COMMON_H
 
-#define VERSION "4.0.1"
+#define VERSION "4.0.2"
 
 #include <algorithm>
 #include <vector>

diff --git a/src/forkSense.cpp b/src/forkSense.cpp
@@ -279,9 +279,9 @@ void callSegmentation(DetectedRead &r){
 
 			if ( abs(endCoord - startCoord) >= minLength ){
 
-				//std::pair<int, int> trim = segmentationTrim(r.positions, r.eduCalls, r.brduCalls, startIdx, endIdx);
-				//startIdx += trim.first;
-				//endIdx -= trim.second;
+				std::pair<int, int> trim = segmentationTrim(r.positions, r.eduCalls, r.brduCalls, startIdx, endIdx);
+				startIdx += trim.first;
+				endIdx -= trim.second;
 				startCoord = r.positions[startIdx];
 				endCoord = r.positions[endIdx];
 
@@ -309,9 +309,9 @@ void callSegmentation(DetectedRead &r){
 
 		if ( abs(endCoord - startCoord) >= minLength ){
 
-			//std::pair<int, int> trim = segmentationTrim(r.positions, r.eduCalls, r.brduCalls, startIdx, endIdx);
-			//startIdx += trim.first;
-			//endIdx -= trim.second;
+			std::pair<int, int> trim = segmentationTrim(r.positions, r.eduCalls, r.brduCalls, startIdx, endIdx);
+			startIdx += trim.first;
+			endIdx -= trim.second;
 			startCoord = r.positions[startIdx];
 			endCoord = r.positions[endIdx];
 
@@ -345,9 +345,9 @@ void callSegmentation(DetectedRead &r){
 
 			if ( abs(endCoord - startCoord) >= minLength ){
 
-				//std::pair<int, int> trim = segmentationTrim(r.positions, r.brduCalls, r.eduCalls, startIdx, endIdx);
-				//startIdx += trim.first;
-				//endIdx -= trim.second;
+				std::pair<int, int> trim = segmentationTrim(r.positions, r.brduCalls, r.eduCalls, startIdx, endIdx);
+				startIdx += trim.first;
+				endIdx -= trim.second;
 				startCoord = r.positions[startIdx];
 				endCoord = r.positions[endIdx];
 
@@ -375,9 +375,9 @@ void callSegmentation(DetectedRead &r){
 
 		if ( abs(endCoord - startCoord) >= minLength ){
 
-			//std::pair<int, int> trim = segmentationTrim(r.positions, r.brduCalls, r.eduCalls, startIdx, endIdx);
-			//startIdx += trim.first;
-			//endIdx -= trim.second;
+			std::pair<int, int> trim = segmentationTrim(r.positions, r.brduCalls, r.eduCalls, startIdx, endIdx);
+			startIdx += trim.first;
+			endIdx -= trim.second;
 			startCoord = r.positions[startIdx];
 			endCoord = r.positions[endIdx];
 
@@ -952,13 +952,13 @@ std::pair<int, int> segmentationTrim(std::vector< int > &positions, std::vector<
 
 	std::vector<double> segmentDensities;
 	int maxCallsInd = segmentCalls.size();
-	for (int i = 0; i < maxCallsInd; i++){
+
+	for (int i = 0.33*maxCallsInd; i < 0.66*maxCallsInd; i++){
 
 		int positiveCalls = 0;
 		int attempts = 0;
 		int lb = std::max(0,i-epsilon);
-		int segmentIdxLen = segmentCalls.size();
-		int ub = std::min(segmentIdxLen, i+epsilon);
+		int ub = std::min(maxCallsInd, i+epsilon);
 
 		for (int j = lb; j < ub; j++){
 
@@ -1012,7 +1012,7 @@ void callStalls(DetectedRead &r, std::string analogueOrder, KMeansResult analoug
 	}
 
 	//non-linear scaling parameters for stall score
-	double beta = 10.; //higher values of beta mean more conservative stall scores
+	double beta = 1.; //higher values of beta mean more conservative stall scores
 	double alpha = 1./log(2./(1.+exp(-1.*beta))); //set alpha so that non-linear scaling of 1 is equal to 1
 
 	//check right forks
@@ -1410,7 +1410,7 @@ KMeansResult estimateAnalogueIncorporation(std::string detectFilename, int readC
 
 	std::vector< double > BrdU_callFractions, EdU_callFractions;
 
-	int resolution = 1000; //look in 1 kb segments
+	int resolution = 2000; //look in 2 kb segments
 
 	int startingPos = -1;
 	int progress = 0;

diff --git a/src/forkSense.h b/src/forkSense.h
@@ -1,5 +1,6 @@
 //----------------------------------------------------------
 // Copyright 2020 University of Cambridge
+// Written by Michael A. Boemo ([email protected])
 // This software is licensed under GPL-3.0.  You should have
 // received a copy of the license with this software.  If
 // not, please Email the author.