From 1432cf75f1e57920856f1ca465ad3bc5650ebbe9 Mon Sep 17 00:00:00 2001
From: "M. Boemo" <mb915@login-q-1.data.cluster>
Date: Mon, 11 Nov 2024 18:40:10 +0000
Subject: [PATCH] bugfixes: human-readable forkSense, modbam writing after
 dorado 5mC calling; QOL: more informative error messages on vbz and forkSense

---
 .gitignore           |  1 +
 src/error_handling.h | 27 +++++++++++++++++++++++++++
 src/fast5.cpp        | 36 ++++++++++++++++++++++++++++++++++--
 src/forkSense.cpp    | 44 +++++++++++++++++++++++++-------------------
 src/pod5.cpp         | 11 +++++++++++
 src/reads.h          |  4 +++-
 6 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2923460..7b86301 100755
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ htslib/
 .cproject
 .pydevproject
 .settings/*
+.vscode/settings.json
 
 #commit log file and path
 src/gitcommit.h
diff --git a/src/error_handling.h b/src/error_handling.h
index 8e4e5e1..fee818d 100755
--- a/src/error_handling.h
+++ b/src/error_handling.h
@@ -34,6 +34,26 @@ struct IOerror : public std::exception {
 	}	
 };
 
+struct VBZError : public std::exception {
+	std::string filename;	
+	VBZError( std::string s ){
+
+		filename = s;
+	}
+	const char* what () const throw () {
+		const char* message = "This fast5 file is compressed with vbz: ";
+		const char* message2 = "\nSee documentation (https://dnascent.readthedocs.io/en/latest/installation.html#building-from-source) for how to load vbz plugin.";
+		const char* specifier = filename.c_str();
+		char* result;
+		result = static_cast<char*>(calloc(strlen(message)+strlen(message2)+strlen(specifier)+1, sizeof(char)));
+		strcpy( result, message);
+		strcat( result, specifier );
+		strcat( result, message2 );
+
+		return result;
+	}	
+};
+
 
 struct InvalidExtension : public std::exception {
 	std::string badFilename;	
@@ -210,6 +230,13 @@ struct MismatchedDimensions : public std::exception {
 };
 
 
+struct ForkSenseData : public std::exception {
+	const char * what () const throw () {
+		return "Not enough data was passed to forkSense to reliably segment analogue regions. Data passed in bam or detect format and should include at least several hundred reads.";
+	}
+};
+
+
 struct ParsingError : public std::exception {
 	const char * what () const throw () {
 		return "Parsing error reading BAM file.";
diff --git a/src/fast5.cpp b/src/fast5.cpp
index a1f21b8..7b45e70 100644
--- a/src/fast5.cpp
+++ b/src/fast5.cpp
@@ -51,6 +51,20 @@ void fast5_getSignal( DNAscent::read &r ){
 	hid_t hdf5_file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
 	if (hdf5_file < 0) throw IOerror(filename.c_str());
 
+	//check for vbz compression and fail if plugin is not loaded
+	bool vbz_compressed = false;
+    unsigned int flags;
+	size_t nelmts = 1;
+    unsigned int values_out[1] = {99};
+	char filter_name[80];
+	std::string signal_path = "/read_" + readID + "/Raw/Signal";
+	hid_t dcheck = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT);
+    hid_t dcpl = H5Dget_create_plist(dcheck);
+    H5Z_filter_t filter_id = H5Pget_filter2(dcpl, (unsigned) 0, &flags, &nelmts, values_out, sizeof(filter_name) - 1, filter_name, NULL);
+    H5Pclose (dcpl);
+    H5Dclose (dcheck);
+	if(filter_id == 32020) vbz_compressed = true;
+
 	//get the channel parameters
 	std::string scaling_path = "/read_" + readID + "/channel_id";
 	hid_t scaling_group = H5Gopen(hdf5_file, scaling_path.c_str(), H5P_DEFAULT);
@@ -65,18 +79,20 @@ void fast5_getSignal( DNAscent::read &r ){
 	float raw_unit;
 	float *rawptr = NULL;
 
-	std::string signal_path = "/read_" + readID + "/Raw/Signal";
 	hid_t dset = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT);
 	if (dset < 0 ) throw BadFast5Field(); 
 	space = H5Dget_space(dset);
 	if (space < 0 ) throw BadFast5Field(); 
 	H5Sget_simple_extent_dims(space, &nsample, NULL);
    	rawptr = (float*)calloc(nsample, sizeof(float));
-    	herr_t status = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, rawptr);
+    herr_t status = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, rawptr);
 
 	if ( status < 0 ){
 		free(rawptr);
 		H5Dclose(dset);
+
+		if (vbz_compressed) throw VBZError(filename);
+
 		throw BadFast5Field();
 	}
 	H5Dclose(dset);
@@ -92,6 +108,15 @@ void fast5_getSignal( DNAscent::read &r ){
 	
 	r.raw = signal_pA;
 
+	//fail on empty signal
+	if (r.raw.size() == 0){
+
+		std::cerr << "Empty signal found in fast5 file." << std::endl;
+		std::cerr << "   ReadID: " << r.readID << std::endl;
+		std::cerr << "   Filename: " << r.filename << std::endl;
+		exit(EXIT_FAILURE);
+	}
+
 	free(rawptr);
 	
 	H5Fclose(hdf5_file);
@@ -193,6 +218,13 @@ std::vector<std::string> fast5_extract_readIDs(std::string filepath){
         // copy the group name
         H5Lget_name_by_idx(hdf5_file, "/", H5_INDEX_NAME, H5_ITER_INC, group_idx, buffer, buffer_size, H5P_DEFAULT);
         buffer[size] = '\0';
+        
+        char prefix[] = "read_";
+	size_t len = strlen(prefix);
+	if (strncmp(buffer, prefix, len) == 0) {
+		memmove(buffer, buffer + len, strlen(buffer + len) + 1);
+	}       
+        
         out.push_back(buffer);
 	
     }
diff --git a/src/forkSense.cpp b/src/forkSense.cpp
index 821c949..bd66306 100755
--- a/src/forkSense.cpp
+++ b/src/forkSense.cpp
@@ -1380,7 +1380,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){
 
 			if ( std::abs(C2_points_old[i] - C1_old) < std::abs(C2_points_old[i] - C2_old) ) C1_points_new.push_back(C2_points_old[i]);
 			else C2_points_new.push_back(C2_points_old[i]);
-		}	
+		}
 
 		C1_new = vectorMean(C1_points_new);
 		C2_new = vectorMean(C2_points_new);
@@ -1390,7 +1390,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){
 
 		iter++;
 	}while (iter < maxIter and (std::abs(C1_old - C1_new)>tol or std::abs(C2_old - C2_new)>tol));
-	
+
 	//calculate lower bounds from K-means segmentation
 	auto C1_lowerBound = std::min_element( C1_points_old.begin(), C1_points_old.end() );
 	auto C2_lowerBound = std::min_element( C2_points_old.begin(), C2_points_old.end() );
@@ -1398,7 +1398,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){
 	//more conservative option - lower bound is mean - 1 stdv
 	double C1_stdv = vectorStdv( C1_points_old, C1_new );
 	double C2_stdv = vectorStdv( C2_points_old, C2_new );
-	
+
 	KMeansResult out = {C1_new, *C1_lowerBound, C1_stdv, C2_new, *C2_lowerBound, C2_stdv};
 
 	return out;
@@ -1624,6 +1624,9 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
 	std::vector<double> BrdUCalls, EdUCalls;
 	std::vector<int> refCoords;
 
+	std::string readID, chromosome, strand;
+	int mappingLower, mappingUpper;
+
 	std::vector< std::shared_ptr<DNAscent::detectedRead> > buffer;
 	int failed = 0;
 	int progress = 0;
@@ -1638,11 +1641,19 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
 			progress++;
 			pb.displayProgress( progress, failed, 0 );
 
+			//add this read to the buffer if it's sufficiently long enough
+			if (refCoords.size() > 2000){
+			
+				std::shared_ptr<DNAscent::detectedRead> r = std::make_shared<DNAscent::detectedRead>(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls);
+				buffer.push_back(r);
+			}
+
+			//empty the buffer if it's full
+			if (buffer.size() >= maxBufferSize) emptyBuffer(buffer, args, fm, analogueIncorporation);
+
 			std::stringstream ssLine(line);
 			std::string column;
 			int cIndex = 0;
-			std::string readID, chromosome, strand;
-			int mappingLower, mappingUpper;
 			while ( std::getline( ssLine, column, ' ' ) ){
 
 				if ( cIndex == 0 ) readID = column;
@@ -1653,20 +1664,6 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
 				else throw DetectParsing();
 				cIndex++;
 			}
-			
-			//add this read to the buffer if it's sufficiently long enough
-			if (refCoords.size() > 2000){
-			
-				std::shared_ptr<DNAscent::detectedRead> r = std::make_shared<DNAscent::detectedRead>(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls);
-				buffer.push_back(r);
-			}
-			else{
-				failed++;
-			}
-
-
-			//empty the buffer if it's full
-			if (buffer.size() >= maxBufferSize) emptyBuffer(buffer, args, fm, analogueIncorporation);
 
 			//clear the analogue and reference vectors
 			BrdUCalls.clear();
@@ -1705,6 +1702,13 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
 		}
 	}
 
+	//add this read to the buffer if it's sufficiently long enough
+	if (refCoords.size() > 2000){
+	
+		std::shared_ptr<DNAscent::detectedRead> r = std::make_shared<DNAscent::detectedRead>(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls);
+		buffer.push_back(r);
+	}
+
 	//empty the buffer at the end
 	if (buffer.size() > 0) emptyBuffer(buffer, args, fm, analogueIncorporation);
 
@@ -1765,6 +1769,8 @@ int sense_main( int argc, char** argv ){
 	if (args.humanReadable)	callFractions_HR(args.detectFilename, BrdU_callFractions, EdU_callFractions, readCount);
 	else callFractions_modbam(args.detectFilename, BrdU_callFractions, EdU_callFractions, readCount, args.threads);
 
+	if (BrdU_callFractions.size() < 10 or EdU_callFractions.size() < 10) throw ForkSenseData();
+
 	KMeansResult analogueIncorporation = estimateAnalogueIncorporation(BrdU_callFractions, EdU_callFractions);
 
  	fs_fileManager fm(args, analogueIncorporation);
diff --git a/src/pod5.cpp b/src/pod5.cpp
index 67e1213..10a5e58 100644
--- a/src/pod5.cpp
+++ b/src/pod5.cpp
@@ -60,6 +60,17 @@ void pod5_getSignal( DNAscent::read &r ){
 		r.raw.push_back( ((float) samples[i] + (float) read_data.calibration_offset) * (float) read_data.calibration_scale );
 	}
 
+	//fail on empty signal
+	if (r.raw.size() == 0){
+
+		std::cerr << "Empty signal found in pod5 file." << std::endl;
+		std::cerr << "   ReadID: " << r.readID << std::endl;
+		std::cerr << "   Filename: " << r.filename << std::endl;
+		std::cerr << "   Pod5 batch: " << r.pod5_batch << std::endl;
+		std::cerr << "   Pod5 row: " << r.pod5_row << std::endl;
+		exit(EXIT_FAILURE);
+	}
+
 	//if this is a bam file generated by dorado, apply the appropriate signal slicing
 	if ( r.signalLength > 0){
 	
diff --git a/src/reads.h b/src/reads.h
index c24e721..49f5f30 100755
--- a/src/reads.h
+++ b/src/reads.h
@@ -451,6 +451,7 @@ namespace DNAscent {
 				uint8_t *tagData_MM = bam_aux_get(record, "MM");
 				if (tagData_MM != nullptr) {
 					existing_MMtag = bam_aux2Z(tagData_MM);
+					bam_aux_del(record,tagData_MM);
 				}
 				
 				//build the base analogue MM tag for this read and populate vectors of analogue calls
@@ -491,7 +492,8 @@ namespace DNAscent {
 					for (uint32_t i = 0; i < tagLength_ML; i++){
 					
 						probabilities_concat.push_back( bam_auxB2i(tagData_ML,i) );
-					}		
+					}
+					bam_aux_del(record,tagData_ML);		
 				}
 				
 				//concatenate the base analogue calls onto the existing tag (if there is one)