Skip to content

Commit

Permalink
bugfixes: human-readable forkSense, modbam writing after dorado 5mC c…
Browse files Browse the repository at this point in the history
…alling; QOL: more informative error messages on vbz and forkSense
  • Loading branch information
M. Boemo committed Nov 11, 2024
1 parent 5bc3279 commit 1432cf7
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ htslib/
.cproject
.pydevproject
.settings/*
.vscode/settings.json

#commit log file and path
src/gitcommit.h
Expand Down
27 changes: 27 additions & 0 deletions src/error_handling.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,26 @@ struct IOerror : public std::exception {
}
};

struct VBZError : public std::exception {
std::string filename;
VBZError( std::string s ){

filename = s;
}
const char* what () const throw () {
const char* message = "This fast5 file is compressed with vbz: ";
const char* message2 = "\nSee documentation (https://dnascent.readthedocs.io/en/latest/installation.html#building-from-source) for how to load vbz plugin.";
const char* specifier = filename.c_str();
char* result;
result = static_cast<char*>(calloc(strlen(message)+strlen(message2)+strlen(specifier)+1, sizeof(char)));
strcpy( result, message);
strcat( result, specifier );
strcat( result, message2 );

return result;
}
};


struct InvalidExtension : public std::exception {
std::string badFilename;
Expand Down Expand Up @@ -210,6 +230,13 @@ struct MismatchedDimensions : public std::exception {
};


struct ForkSenseData : public std::exception {
const char * what () const throw () {
return "Not enough data was passed to forkSense to reliably segment analogue regions. Data passed in bam or detect format and should include at least several hundred reads.";
}
};


struct ParsingError : public std::exception {
const char * what () const throw () {
return "Parsing error reading BAM file.";
Expand Down
36 changes: 34 additions & 2 deletions src/fast5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ void fast5_getSignal( DNAscent::read &r ){
hid_t hdf5_file = H5Fopen(filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
if (hdf5_file < 0) throw IOerror(filename.c_str());

//check for vbz compression and fail if plugin is not loaded
bool vbz_compressed = false;
unsigned int flags;
size_t nelmts = 1;
unsigned int values_out[1] = {99};
char filter_name[80];
std::string signal_path = "/read_" + readID + "/Raw/Signal";
hid_t dcheck = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT);
hid_t dcpl = H5Dget_create_plist(dcheck);
H5Z_filter_t filter_id = H5Pget_filter2(dcpl, (unsigned) 0, &flags, &nelmts, values_out, sizeof(filter_name) - 1, filter_name, NULL);
H5Pclose (dcpl);
H5Dclose (dcheck);
if(filter_id == 32020) vbz_compressed = true;

//get the channel parameters
std::string scaling_path = "/read_" + readID + "/channel_id";
hid_t scaling_group = H5Gopen(hdf5_file, scaling_path.c_str(), H5P_DEFAULT);
Expand All @@ -65,18 +79,20 @@ void fast5_getSignal( DNAscent::read &r ){
float raw_unit;
float *rawptr = NULL;

std::string signal_path = "/read_" + readID + "/Raw/Signal";
hid_t dset = H5Dopen(hdf5_file, signal_path.c_str(), H5P_DEFAULT);
if (dset < 0 ) throw BadFast5Field();
space = H5Dget_space(dset);
if (space < 0 ) throw BadFast5Field();
H5Sget_simple_extent_dims(space, &nsample, NULL);
rawptr = (float*)calloc(nsample, sizeof(float));
herr_t status = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, rawptr);
herr_t status = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, rawptr);

if ( status < 0 ){
free(rawptr);
H5Dclose(dset);

if (vbz_compressed) throw VBZError(filename);

throw BadFast5Field();
}
H5Dclose(dset);
Expand All @@ -92,6 +108,15 @@ void fast5_getSignal( DNAscent::read &r ){

r.raw = signal_pA;

//fail on empty signal
if (r.raw.size() == 0){

std::cerr << "Empty signal found in fast5 file." << std::endl;
std::cerr << " ReadID: " << r.readID << std::endl;
std::cerr << " Filename: " << r.filename << std::endl;
exit(EXIT_FAILURE);
}

free(rawptr);

H5Fclose(hdf5_file);
Expand Down Expand Up @@ -193,6 +218,13 @@ std::vector<std::string> fast5_extract_readIDs(std::string filepath){
// copy the group name
H5Lget_name_by_idx(hdf5_file, "/", H5_INDEX_NAME, H5_ITER_INC, group_idx, buffer, buffer_size, H5P_DEFAULT);
buffer[size] = '\0';

char prefix[] = "read_";
size_t len = strlen(prefix);
if (strncmp(buffer, prefix, len) == 0) {
memmove(buffer, buffer + len, strlen(buffer + len) + 1);
}

out.push_back(buffer);

}
Expand Down
44 changes: 25 additions & 19 deletions src/forkSense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1380,7 +1380,7 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){

if ( std::abs(C2_points_old[i] - C1_old) < std::abs(C2_points_old[i] - C2_old) ) C1_points_new.push_back(C2_points_old[i]);
else C2_points_new.push_back(C2_points_old[i]);
}
}

C1_new = vectorMean(C1_points_new);
C2_new = vectorMean(C2_points_new);
Expand All @@ -1390,15 +1390,15 @@ KMeansResult twoMeans_fs( std::vector< double > &observations ){

iter++;
}while (iter < maxIter and (std::abs(C1_old - C1_new)>tol or std::abs(C2_old - C2_new)>tol));

//calculate lower bounds from K-means segmentation
auto C1_lowerBound = std::min_element( C1_points_old.begin(), C1_points_old.end() );
auto C2_lowerBound = std::min_element( C2_points_old.begin(), C2_points_old.end() );

//more conservative option - lower bound is mean - 1 stdv
double C1_stdv = vectorStdv( C1_points_old, C1_new );
double C2_stdv = vectorStdv( C2_points_old, C2_new );

KMeansResult out = {C1_new, *C1_lowerBound, C1_stdv, C2_new, *C2_lowerBound, C2_stdv};

return out;
Expand Down Expand Up @@ -1624,6 +1624,9 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
std::vector<double> BrdUCalls, EdUCalls;
std::vector<int> refCoords;

std::string readID, chromosome, strand;
int mappingLower, mappingUpper;

std::vector< std::shared_ptr<DNAscent::detectedRead> > buffer;
int failed = 0;
int progress = 0;
Expand All @@ -1638,11 +1641,19 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
progress++;
pb.displayProgress( progress, failed, 0 );

//add this read to the buffer if it's sufficiently long enough
if (refCoords.size() > 2000){

std::shared_ptr<DNAscent::detectedRead> r = std::make_shared<DNAscent::detectedRead>(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls);
buffer.push_back(r);
}

//empty the buffer if it's full
if (buffer.size() >= maxBufferSize) emptyBuffer(buffer, args, fm, analogueIncorporation);

std::stringstream ssLine(line);
std::string column;
int cIndex = 0;
std::string readID, chromosome, strand;
int mappingLower, mappingUpper;
while ( std::getline( ssLine, column, ' ' ) ){

if ( cIndex == 0 ) readID = column;
Expand All @@ -1653,20 +1664,6 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
else throw DetectParsing();
cIndex++;
}

//add this read to the buffer if it's sufficiently long enough
if (refCoords.size() > 2000){

std::shared_ptr<DNAscent::detectedRead> r = std::make_shared<DNAscent::detectedRead>(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls);
buffer.push_back(r);
}
else{
failed++;
}


//empty the buffer if it's full
if (buffer.size() >= maxBufferSize) emptyBuffer(buffer, args, fm, analogueIncorporation);

//clear the analogue and reference vectors
BrdUCalls.clear();
Expand Down Expand Up @@ -1705,6 +1702,13 @@ void iterateOnHumanReadable(forkSenseArgs &args, fs_fileManager &fm, KMeansResul
}
}

//add this read to the buffer if it's sufficiently long enough
if (refCoords.size() > 2000){

std::shared_ptr<DNAscent::detectedRead> r = std::make_shared<DNAscent::detectedRead>(readID, chromosome, mappingLower, mappingUpper, strand, refCoords, EdUCalls, BrdUCalls);
buffer.push_back(r);
}

//empty the buffer at the end
if (buffer.size() > 0) emptyBuffer(buffer, args, fm, analogueIncorporation);

Expand Down Expand Up @@ -1765,6 +1769,8 @@ int sense_main( int argc, char** argv ){
if (args.humanReadable) callFractions_HR(args.detectFilename, BrdU_callFractions, EdU_callFractions, readCount);
else callFractions_modbam(args.detectFilename, BrdU_callFractions, EdU_callFractions, readCount, args.threads);

if (BrdU_callFractions.size() < 10 or EdU_callFractions.size() < 10) throw ForkSenseData();

KMeansResult analogueIncorporation = estimateAnalogueIncorporation(BrdU_callFractions, EdU_callFractions);

fs_fileManager fm(args, analogueIncorporation);
Expand Down
11 changes: 11 additions & 0 deletions src/pod5.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,17 @@ void pod5_getSignal( DNAscent::read &r ){
r.raw.push_back( ((float) samples[i] + (float) read_data.calibration_offset) * (float) read_data.calibration_scale );
}

//fail on empty signal
if (r.raw.size() == 0){

std::cerr << "Empty signal found in pod5 file." << std::endl;
std::cerr << " ReadID: " << r.readID << std::endl;
std::cerr << " Filename: " << r.filename << std::endl;
std::cerr << " Pod5 batch: " << r.pod5_batch << std::endl;
std::cerr << " Pod5 row: " << r.pod5_row << std::endl;
exit(EXIT_FAILURE);
}

//if this is a bam file generated by dorado, apply the appropriate signal slicing
if ( r.signalLength > 0){

Expand Down
4 changes: 3 additions & 1 deletion src/reads.h
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ namespace DNAscent {
uint8_t *tagData_MM = bam_aux_get(record, "MM");
if (tagData_MM != nullptr) {
existing_MMtag = bam_aux2Z(tagData_MM);
bam_aux_del(record,tagData_MM);
}

//build the base analogue MM tag for this read and populate vectors of analogue calls
Expand Down Expand Up @@ -491,7 +492,8 @@ namespace DNAscent {
for (uint32_t i = 0; i < tagLength_ML; i++){

probabilities_concat.push_back( bam_auxB2i(tagData_ML,i) );
}
}
bam_aux_del(record,tagData_ML);
}

//concatenate the base analogue calls onto the existing tag (if there is one)
Expand Down

0 comments on commit 1432cf7

Please sign in to comment.