From 1432cf75f1e57920856f1ca465ad3bc5650ebbe9 Mon Sep 17 00:00:00 2001 From: "M. Boemo" Date: Mon, 11 Nov 2024 18:40:10 +0000 Subject: [PATCH] bugfixes: human-readable forkSense, modbam writing after dorado 5mC calling; QOL: more informative error messages on vbz and forkSense --- .gitignore | 1 + src/error_handling.h | 27 +++++++++++++++++++++++++++ src/fast5.cpp | 36 ++++++++++++++++++++++++++++++++++-- src/forkSense.cpp | 44 +++++++++++++++++++++++++------------------- src/pod5.cpp | 11 +++++++++++ src/reads.h | 4 +++- 6 files changed, 101 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 2923460..7b86301 100755 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ htslib/ .cproject .pydevproject .settings/* +.vscode/settings.json #commit log file and path src/gitcommit.h diff --git a/src/error_handling.h b/src/error_handling.h index 8e4e5e1..fee818d 100755 --- a/src/error_handling.h +++ b/src/error_handling.h @@ -34,6 +34,26 @@ struct IOerror : public std::exception { } }; +struct VBZError : public std::exception { + std::string filename; + VBZError( std::string s ){ + + filename = s; + } + const char* what () const throw () { + const char* message = "This fast5 file is compressed with vbz: "; + const char* message2 = "\nSee documentation (https://dnascent.readthedocs.io/en/latest/installation.html#building-from-source) for how to load vbz plugin."; + const char* specifier = filename.c_str(); + char* result; + result = static_cast(calloc(strlen(message)+strlen(message2)+strlen(specifier)+1, sizeof(char))); + strcpy( result, message); + strcat( result, specifier ); + strcat( result, message2 ); + + return result; + } +}; + struct InvalidExtension : public std::exception { std::string badFilename; @@ -210,6 +230,13 @@ struct MismatchedDimensions : public std::exception { }; +struct ForkSenseData : public std::exception { + const char * what () const throw () { + return "Not enough data was passed to forkSense to reliably segment analogue regions. Data passed in bam or detect format and should include at least several hundred reads."; + } +}; + + struct ParsingError : public std::exception { const char * what () const throw () { return "Parsing error reading BAM file."; diff --git a/src/fast5.cpp b/src/fast5.cpp index a1f21b8..7b45e70 100644 --- a/src/fast5.cpp +++ b/src/fast5.cpp @@ -51,6 +51,20 @@ void fast5_getSignal( DNAscent::read &r ){ hid_t hdf5_file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); if (hdf5_file < 0) throw IOerror(filename.c_str()); + //check for vbz compression and fail if plugin is not loaded + bool vbz_compressed = false; + unsigned int flags; + size_t nelmts = 1; + unsigned int values_out[1] = {99}; + char filter_name[80]; + std::string signal_path = "/read_" + readID + "/Raw/Signal"; + hid_t dcheck = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT); + hid_t dcpl = H5Dget_create_plist(dcheck); + H5Z_filter_t filter_id = H5Pget_filter2(dcpl, (unsigned) 0, &flags, &nelmts, values_out, sizeof(filter_name) - 1, filter_name, NULL); + H5Pclose (dcpl); + H5Dclose (dcheck); + if(filter_id == 32020) vbz_compressed = true; + //get the channel parameters std::string scaling_path = "/read_" + readID + "/channel_id"; hid_t scaling_group = H5Gopen(hdf5_file, scaling_path.c_str(), H5P_DEFAULT); @@ -65,18 +79,20 @@ void fast5_getSignal( DNAscent::read &r ){ float raw_unit; float *rawptr = NULL; - std::string signal_path = "/read_" + readID + "/Raw/Signal"; hid_t dset = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT); if (dset < 0 ) throw BadFast5Field(); space = H5Dget_space(dset); if (space < 0 ) throw BadFast5Field(); H5Sget_simple_extent_dims(space, &nsample, NULL); rawptr = (float*)calloc(nsample, sizeof(float)); - herr_t status = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, rawptr); + herr_t status = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, rawptr); if ( status < 0 ){ free(rawptr); H5Dclose(dset); + + if (vbz_compressed) throw VBZError(filename); + throw BadFast5Field(); } H5Dclose(dset); @@ -92,6 +108,15 @@ void fast5_getSignal( DNAscent::read &r ){ r.raw = signal_pA; + //fail on empty signal + if (r.raw.size() == 0){ + + std::cerr << "Empty signal found in fast5 file." << std::endl; + std::cerr << " ReadID: " << r.readID << std::endl; + std::cerr << " Filename: " << r.filename << std::endl; + exit(EXIT_FAILURE); + } + free(rawptr); H5Fclose(hdf5_file); @@ -193,6 +218,13 @@ std::vector fast5_extract_readIDs(std::string filepath){ // copy the group name H5Lget_name_by_idx(hdf5_file, "/", H5_INDEX_NAME, H5_ITER_INC, group_idx, buffer, buffer_size, H5P_DEFAULT); buffer[size] = '\0'; + + char prefix[] = "read_"; + size_t len = strlen(prefix); + if (strncmp(buffer, prefix, len) == 0) { + memmove(buffer, buffer + len, strlen(buffer + len) + 1); + } + out.push_back(buffer); } diff --git a/src/forkSense.cpp b/src/forkSense.cpp index 821c949..bd66306 100755 --- a/src/forkSense.cpp +++ b/src/forkSense.cpp @@ -1380,7 +1380,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){ if ( std::abs(C2_points_old[i] - C1_old) < std::abs(C2_points_old[i] - C2_old) ) C1_points_new.push_back(C2_points_old[i]); else C2_points_new.push_back(C2_points_old[i]); - } + } C1_new = vectorMean(C1_points_new); C2_new = vectorMean(C2_points_new); @@ -1390,7 +1390,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){ iter++; }while (iter < maxIter and (std::abs(C1_old - C1_new)>tol or std::abs(C2_old - C2_new)>tol)); - + //calculate lower bounds from K-means segmentation auto C1_lowerBound = std::min_element( C1_points_old.begin(), C1_points_old.end() ); auto C2_lowerBound = std::min_element( C2_points_old.begin(), C2_points_old.end() ); @@ -1398,7 +1398,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){ //more conservative option - lower bound is mean - 1 stdv double C1_stdv = vectorStdv( C1_points_old, C1_new ); double C2_stdv = vectorStdv( C2_points_old, C2_new ); - + KMeansResult out = {C1_new, *C1_lowerBound, C1_stdv, C2_new, *C2_lowerBound, C2_stdv}; return out; @@ -1624,6 +1624,9 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul std::vector BrdUCalls, EdUCalls; std::vector refCoords; + std::string readID, chromosome, strand; + int mappingLower, mappingUpper; + std::vector< std::shared_ptr > buffer; int failed = 0; int progress = 0; @@ -1638,11 +1641,19 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul progress++; pb.displayProgress( progress, failed, 0 ); + //add this read to the buffer if it's sufficiently long enough + if (refCoords.size() > 2000){ + + std::shared_ptr r = std::make_shared(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls); + buffer.push_back(r); + } + + //empty the buffer if it's full + if (buffer.size() >= maxBufferSize) emptyBuffer(buffer, args, fm, analogueIncorporation); + std::stringstream ssLine(line); std::string column; int cIndex = 0; - std::string readID, chromosome, strand; - int mappingLower, mappingUpper; while ( std::getline( ssLine, column, ' ' ) ){ if ( cIndex == 0 ) readID = column; @@ -1653,20 +1664,6 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul else throw DetectParsing(); cIndex++; } - - //add this read to the buffer if it's sufficiently long enough - if (refCoords.size() > 2000){ - - std::shared_ptr r = std::make_shared(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls); - buffer.push_back(r); - } - else{ - failed++; - } - - - //empty the buffer if it's full - if (buffer.size() >= maxBufferSize) emptyBuffer(buffer, args, fm, analogueIncorporation); //clear the analogue and reference vectors BrdUCalls.clear(); @@ -1705,6 +1702,13 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul } } + //add this read to the buffer if it's sufficiently long enough + if (refCoords.size() > 2000){ + + std::shared_ptr r = std::make_shared(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls); + buffer.push_back(r); + } + //empty the buffer at the end if (buffer.size() > 0) emptyBuffer(buffer, args, fm, analogueIncorporation); @@ -1765,6 +1769,8 @@ int sense_main( int argc, char** argv ){ if (args.humanReadable) callFractions_HR(args.detectFilename, BrdU_callFractions, EdU_callFractions, readCount); else callFractions_modbam(args.detectFilename, BrdU_callFractions, EdU_callFractions, readCount, args.threads); + if (BrdU_callFractions.size() < 10 or EdU_callFractions.size() < 10) throw ForkSenseData(); + KMeansResult analogueIncorporation = estimateAnalogueIncorporation(BrdU_callFractions, EdU_callFractions); fs_fileManager fm(args, analogueIncorporation); diff --git a/src/pod5.cpp b/src/pod5.cpp index 67e1213..10a5e58 100644 --- a/src/pod5.cpp +++ b/src/pod5.cpp @@ -60,6 +60,17 @@ void pod5_getSignal( DNAscent::read &r ){ r.raw.push_back( ((float) samples[i] + (float) read_data.calibration_offset) * (float) read_data.calibration_scale ); } + //fail on empty signal + if (r.raw.size() == 0){ + + std::cerr << "Empty signal found in pod5 file." << std::endl; + std::cerr << " ReadID: " << r.readID << std::endl; + std::cerr << " Filename: " << r.filename << std::endl; + std::cerr << " Pod5 batch: " << r.pod5_batch << std::endl; + std::cerr << " Pod5 row: " << r.pod5_row << std::endl; + exit(EXIT_FAILURE); + } + //if this is a bam file generated by dorado, apply the appropriate signal slicing if ( r.signalLength > 0){ diff --git a/src/reads.h b/src/reads.h index c24e721..49f5f30 100755 --- a/src/reads.h +++ b/src/reads.h @@ -451,6 +451,7 @@ namespace DNAscent { uint8_t *tagData_MM = bam_aux_get(record, "MM"); if (tagData_MM != nullptr) { existing_MMtag = bam_aux2Z(tagData_MM); + bam_aux_del(record,tagData_MM); } //build the base analogue MM tag for this read and populate vectors of analogue calls @@ -491,7 +492,8 @@ namespace DNAscent { for (uint32_t i = 0; i < tagLength_ML; i++){ probabilities_concat.push_back( bam_auxB2i(tagData_ML,i) ); - } + } + bam_aux_del(record,tagData_ML); } //concatenate the base analogue calls onto the existing tag (if there is one)