From 17915295895b44a5b0c5c8f33c8ac1e668066b25 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 3 Oct 2024 15:04:42 -0400 Subject: [PATCH] Work on read overlap local realignment --- __main__.py | 26 +------ include/input_data.h | 11 --- include/sv_caller.h | 10 ++- src/contextsv.cpp | 13 ++-- src/input_data.cpp | 22 ------ src/sv_caller.cpp | 173 ++++++++++++++++++++++++++++++++++++++----- 6 files changed, 167 insertions(+), 88 deletions(-) diff --git a/__main__.py b/__main__.py index 7a759120..cc1aecfd 100644 --- a/__main__.py +++ b/__main__.py @@ -153,24 +153,6 @@ def main(): required=False, help=argparse.SUPPRESS ) - - # Turn off CIGAR string SV detection (split-read only) - parser.add_argument( - "--disable-cigar", - required=False, - action="store_true", - default=False, - help=argparse.SUPPRESS - ) - - # Turn off SNP-based CNV predictions for SV classification. - parser.add_argument( - "--disable-snp-cnv", - required=False, - action="store_true", - default=False, - help=argparse.SUPPRESS - ) # ----------------------------------------------------------------------- # @@ -195,8 +177,8 @@ def main(): log.warning("Short read alignment file not provided. Using long read alignment file in its place.") args.short_read = args.long_read - # SNPs file is required unless SNP-based CNV predictions are disabled. - if (args.snps is None and not args.disable_snp_cnv): + # SNPs file is required + if (args.snps is None): log.error("Please provide the SNPs file.") arg_error = True @@ -231,8 +213,6 @@ def main(): input_data.setAlleleFreqFilepaths(args.pfb) input_data.setHMMFilepath(args.hmm) input_data.setOutputDir(args.output) - input_data.setDisableCIGAR(args.disable_cigar) - input_data.setDisableSNPCNV(args.disable_snp_cnv) input_data.saveCNVData(args.save_cnv) input_data.setWindowSize(args.window_size) @@ -246,7 +226,7 @@ def main(): # cnv_data_path = os.path.join(args.output, "cnv_data.tsv") # Generate python-based CNV plots if SNP-based CNV predictions are enabled - if (args.save_cnv and not args.disable_snp_cnv): + if (args.save_cnv): log.info("Generating CNV plots...") # Find all TSV files in the output directory diff --git a/include/input_data.h b/include/input_data.h index 416154ef..d6780d14 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -94,15 +94,6 @@ class InputData { void setThreadCount(int thread_count); int getThreadCount(); - // Disable CIGAR string SV calling. This is useful for testing. - void setDisableCIGAR(bool disable_cigar); - bool getDisableCIGAR(); - - // Disable SNP-based CNV calling. This is useful for calling SVs on - // assemblies since alignment calling is sufficient. - void setDisableSNPCNV(bool disable_snp_cnv); - bool getDisableSNPCNV(); - // Set the whole genome flag to true if the entire genome is being // analyzed. void setWholeGenome(bool whole_genome); @@ -137,8 +128,6 @@ class InputData { std::map chr_cov; // Map of pre-calculated mean coverage values for each chromosome int thread_count; std::string hmm_filepath; - bool disable_cigar; - bool disable_snp_cnv; std::string cnv_filepath; bool whole_genome; // True if the entire genome is being analyzed bool verbose; // True if verbose output is enabled diff --git a/include/sv_caller.h b/include/sv_caller.h index 43268894..d92609db 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -15,8 +15,9 @@ #include /// @endcond -// SV candidate alignment data (chr, start, end, sequence) -using AlignmentData = std::tuple; +// SV candidate alignment data (chr, start, end, sequence, query start, query +// end, mismatch rate) +using AlignmentData = std::tuple; using AlignmentVector = std::vector; // Query map (query name, alignment vector) @@ -31,8 +32,9 @@ class SVCaller { InputData* input_data; std::mutex sv_mtx; // Mutex for locking the SV data - // Detect SVs from long read alignments in the CIGAR string - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls); + // Detect SVs from the CIGAR string of a read alignment, and return the + // mismatch rate, and the start and end positions of the query sequence + std::tuple detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 1d8eb0cf..2dc4cd65 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -32,14 +32,11 @@ int ContextSV::run() SVCaller sv_caller(*this->input_data); SVData sv_calls = sv_caller.run(); - // Classify SVs based on SNP CNV predictions if enabled - if (this->input_data->getDisableSNPCNV() == false) { - // Call CNVs at SNP positions - std::cout << "Running CNV predictions..." << std::endl; - CNVCaller cnv_caller(*this->input_data); - cnv_caller.run(sv_calls); - std::cout << "CNV predictions complete." << std::endl; - } + // Classify SVs based on copy number predictions + std::cout << "Running copy number predictions..." << std::endl; + CNVCaller cnv_caller(*this->input_data); + cnv_caller.run(sv_calls); + std::cout << "Copy number predictions complete." << std::endl; // Print the total number of SVs called std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl; diff --git a/src/input_data.cpp b/src/input_data.cpp index 0b92a699..0384ebd1 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -30,8 +30,6 @@ InputData::InputData() this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; this->whole_genome = true; - this->disable_cigar = false; - this->disable_snp_cnv = false; this->verbose = false; this->save_cnv_data = false; } @@ -458,26 +456,6 @@ void InputData::setHMMFilepath(std::string filepath) } } -void InputData::setDisableCIGAR(bool disable_cigar) -{ - this->disable_cigar = disable_cigar; -} - -bool InputData::getDisableCIGAR() -{ - return this->disable_cigar; -} - -void InputData::setDisableSNPCNV(bool disable_snp_cnv) -{ - this->disable_snp_cnv = disable_snp_cnv; -} - -bool InputData::getDisableSNPCNV() -{ - return this->disable_snp_cnv; -} - void InputData::setWholeGenome(bool whole_genome) { this->whole_genome = whole_genome; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index da598167..183b4be4 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -37,9 +37,6 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) SVData sv_calls; std::string bam_filepath = this->input_data->getLongReadBam(); - // Check if the CIGAR string should be used for SV calling - bool disable_cigar = this->input_data->getDisableCIGAR(); - // Open the BAM file in a thread-safe manner samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (fp_in == NULL) { @@ -72,8 +69,8 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) SuppMap supplementary_alignments; while (readNextAlignment(fp_in, itr, bam1) >= 0) { - // Skip secondary and unmapped alignments - if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP) { + // Skip secondary and unmapped alignments, duplicates, and QC failures + if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL) { // Do nothing // Skip alignments with low mapping quality @@ -81,7 +78,6 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) // Do nothing } else { - // Get the QNAME (query template name) for associating split reads std::string qname = bam_get_qname(bam1); @@ -93,14 +89,49 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) int64_t start = bam1->core.pos; int64_t end = bam_endpos(bam1); + // Call SVs directly from the CIGAR string + std::tuple query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); + double mismatch_rate = std::get<0>(query_info); + int32_t query_start = std::get<1>(query_info); + int32_t query_end = std::get<2>(query_info); + // Add the primary alignment to the map - AlignmentData alignment(chr, start, end, "."); + AlignmentData alignment(chr, start, end, ".", query_start, query_end, mismatch_rate); // primary_alignments[qname].push_back(alignment); primary_alignments[qname] = alignment; - // Call SVs directly from the CIGAR string - if (disable_cigar == false) { - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls); + // Check if there are supplementary alignments, determine if + // there is overlap, and trim the alignments with the highest + // mismatch rate + auto it = supplementary_alignments.find(qname); + if (it != supplementary_alignments.end()) { + AlignmentVector supp_alignments = it->second; + for (auto& supp_alignment : supp_alignments) { + // Get the query start and end positions + int32_t supp_query_start = std::get<4>(supp_alignment); + int32_t supp_query_end = std::get<5>(supp_alignment); + + // Check if there is overlap between the primary and + // supplementary alignments + int overlap = std::max(0, std::min(query_end, supp_query_end) - std::max(query_start, supp_query_start)); + if (overlap > 0) { + // Calculate the mismatch rate at the overlap + query_ovelap_start = std::max(query_start, supp_query_start); + query_overlap_end = std::min(query_end, supp_query_end); + + // Get the sequence of the primary alignment + std::string primary_seq = bam_get_seq(bam1); + std::string supp_seq = bam_get_seq(supp_alignment); + + // TODO: Calculate the mismatch rate at the overlap + int mismatch_count = 0; + for (int i = query_overlap_start; i < query_overlap_end; i++) { + if (primary_seq[i] != supp_seq[i]) { + mismatch_count++; + } + } + } + } } // Process supplementary alignments @@ -110,10 +141,15 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) std::string chr = bamHdr->target_name[bam1->core.tid]; int32_t start = bam1->core.pos; int32_t end = bam_endpos(bam1); - AlignmentData alignment(chr, start, end, "."); + + // Get CIGAR string information (don't call SVs in the supplementary alignments) + std::tuple query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); + double mismatch_rate = std::get<0>(query_info); + int32_t query_start = std::get<1>(query_info); + int32_t query_end = std::get<2>(query_info); // Add the supplementary alignment to the map - //supplementary_alignments[qname].push_back(alignment); + AlignmentData alignment(chr, start, end, ".", query_start, query_end, mismatch_rate); auto it = supplementary_alignments.find(qname); if (it == supplementary_alignments.end()) { supplementary_alignments[qname] = {alignment}; @@ -150,14 +186,12 @@ SVCaller::SVCaller(InputData &input_data) { this->input_data = &input_data; } - -// Detect SVs from the CIGAR string of a read alignment. -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls) +std::tuple SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary) { // Get the chromosome std::string chr = header->target_name[alignment->core.tid]; - // Get the position + // Get the position of the alignment in the reference genome int32_t pos = alignment->core.pos; // Get the CIGAR string @@ -176,9 +210,17 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& std::vector threads; std::vector sv_calls_vec; - // Loop through the CIGAR string and process operations + // Loop through the CIGAR string, process operations, detect SVs (primary + // only), update clipped base support, calculate sequence identity for + // potential duplications (primary only), and calculate + // the clipped base support and mismatch rate int32_t ref_pos; int32_t ref_end; + int match_count = 0; + int mismatch_count = 0; + int32_t query_start = 0; // First alignment position in the query + int32_t query_end = 0; // Last alignment position in the query + bool first_op = false; // First alignment operation for the query for (int i = 0; i < cigar_len; i++) { // Get the CIGAR operation @@ -188,7 +230,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& int op_len = bam_cigar_oplen(cigar[i]); // Check if the CIGAR operation is an insertion - if (op == BAM_CINS) { + if (op == BAM_CINS && is_primary) { // Add the SV if greater than the minimum SV size if (op_len >= this->min_sv_size) { @@ -255,7 +297,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& } // Check if the CIGAR operation is a deletion - } else if (op == BAM_CDEL) { + } else if (op == BAM_CDEL && is_primary) { // Add the SV if greater than the minimum SV size if (op_len >= this->min_sv_size) { @@ -275,6 +317,46 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& // Update the clipped base support std::lock_guard lock(this->sv_mtx); sv_calls.updateClippedBaseSupport(chr, pos); + + // Update the query alignment start position + if (!first_op) { + query_start = query_pos + op_len; + first_op = true; + } + } + + // Update match/mismatch counts + if (op == BAM_CEQUAL) { + match_count += op_len; + } else if (op == BAM_CDIFF) { + mismatch_count += op_len; + } else if (op == BAM_CMATCH) { + // Compare read and reference sequences + // Get the sequence from the query + uint8_t* seq_ptr = bam_get_seq(alignment); + std::string cmatch_seq_str = ""; + for (int j = 0; j < op_len; j++) { + cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; + } + + // Get the corresponding reference sequence + int cmatch_pos = pos + 1; // Querying the reference genome is 1-based + std::string cmatch_ref_str = this->input_data->queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); + + // Check that the two sequence lengths are equal + if (cmatch_seq_str.length() != cmatch_ref_str.length()) { + std::cerr << "ERROR: Sequence lengths do not match" << std::endl; + exit(1); + } + + // Compare the two sequences and update the mismatch count + for (int j = 0; j < op_len; j++) { + if (cmatch_seq_str[j] != cmatch_ref_str[j]) { + mismatch_count++; + } else { + match_count++; + } + } } // Update the reference coordinate based on the CIGAR operation @@ -298,6 +380,17 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& exit(1); } } + + // Update the query end position + query_end = query_pos; + + // Calculate the mismatch rate + double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); + std::cout << "Mismatch count: " << mismatch_count << std::endl; + std::cout << "Match count: " << match_count << std::endl; + std::cout << "Mismatch rate (%): " << mismatch_rate * 100 << std::endl; + + return std::tuple(mismatch_rate, query_start, query_end); } // Detect SVs from split read alignments (primary and supplementary) and @@ -439,10 +532,17 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Get the primary alignment chromosome std::string primary_chr = std::get<0>(primary_alignment); - // Get the start and end positions of the primary alignment + // Get the start and end positions of the primary alignment (reference) int32_t primary_start = std::get<1>(primary_alignment); int32_t primary_end = std::get<2>(primary_alignment); + // Get the query start and end positions + int32_t primary_query_start = std::get<4>(primary_alignment); + int32_t primary_query_end = std::get<5>(primary_alignment); + + // Get the mismatch rate + double primary_mismatch_rate = std::get<6>(primary_alignment); + // Loop through the supplementary alignments and find gaps and overlaps AlignmentVector supp_alignments = supp_map[qname]; for (const auto& supp_alignment : supp_alignments) { @@ -458,9 +558,34 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map } // Get the start and end positions of the supplementary alignment + // (on the reference genome) int32_t supp_start = std::get<1>(supp_alignment); int32_t supp_end = std::get<2>(supp_alignment); + // Get the query start and end positions + int32_t supp_query_start = std::get<4>(supp_alignment); + int32_t supp_query_end = std::get<5>(supp_alignment); + + // Get the mismatch rate + double supp_mismatch_rate = std::get<6>(supp_alignment); + + // Determine if there is overlap between the primary and + // supplementary query sequences + int32_t overlap_start = std::max(primary_query_start, supp_query_start); + int32_t overlap_end = std::min(primary_query_end, supp_query_end); + int32_t overlap_length = overlap_end - overlap_start; + if (overlap_length > 0) { + // Choose the alignment with the lower mismatch rate, + // and trim the overlap from the other alignment + if (primary_mismatch_rate < supp_mismatch_rate) { + // Trim the overlap from the supplementary alignment + supp_start = supp_end - overlap_length; + } else { + // Trim the overlap from the primary alignment + primary_end = primary_start + overlap_length; + } + } + // Gap analysis (deletion or duplication) if (supp_start < primary_start && supp_end < primary_start) { // Gap with supplementary before primary: @@ -468,12 +593,16 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Use the gap ends as the SV endpoints if (primary_start - supp_end >= this->min_sv_size) { + + // Add the SV call sv_calls.add(supp_chr, supp_end+1, primary_start+1, UNKNOWN, ".", "GAPINNER_A"); sv_count++; } // Also use the alignment ends as the SV endpoints if (primary_end - supp_start >= this->min_sv_size) { + + // Add the SV call sv_calls.add(supp_chr, supp_start+1, primary_end+1, UNKNOWN, ".", "GAPOUTER_A"); sv_count++; } @@ -485,12 +614,16 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Use the gap ends as the SV endpoints if (supp_start - primary_end >= this->min_sv_size) { + + // Add the SV call sv_calls.add(supp_chr, primary_end+1, supp_start+1, UNKNOWN, ".", "GAPINNER_B"); sv_count++; } // Also use the alignment ends as the SV endpoints if (supp_end - primary_start >= this->min_sv_size) { + + // Add the SV call sv_calls.add(supp_chr, primary_start+1, supp_end+1, UNKNOWN, ".", "GAPOUTER_B"); sv_count++; }