From 103c5f12f5fa73535c7df013773178785d3665f2 Mon Sep 17 00:00:00 2001 From: J35P312 Date: Tue, 14 Jul 2015 17:07:50 +0200 Subject: [PATCH] modified: src/data_structures/Translocation.cpp modified: src/data_structures/Translocation.h modified: src/data_structures/findTranslocationsOnTheFly.cpp --- src/data_structures/Translocation.cpp | 143 +++++++++++++++--- src/data_structures/Translocation.h | 2 + .../findTranslocationsOnTheFly.cpp | 2 + 3 files changed, 128 insertions(+), 19 deletions(-) diff --git a/src/data_structures/Translocation.cpp b/src/data_structures/Translocation.cpp index 695e061..19d7413 100644 --- a/src/data_structures/Translocation.cpp +++ b/src/data_structures/Translocation.cpp @@ -12,7 +12,55 @@ bool sortMate(long i, long j) { return (i < j); } +string VCFHeader(){ + string headerString =""; + //Define fileformat and source + headerString+="##fileformat=VCFv4.1\n"; + headerString+="##source=FindTranslocations\n"; + //define the alowed events + headerString+="##ALT=\n"; + headerString+="##ALT=\n"; + headerString+="##ALT=\n"; + //Define the info fields + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + headerString+="##INFO=\n"; + //set filters + headerString+="##FILTER=\n"; + headerString+="=##FILTER=\n"; + headerString+="=##FILTER=\n"; + //Header + headerString+="#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + + return(headerString); +} + +string filterFunction(double RATIO, int linksToB, int LinksFromWindow,float mean_insert, float std_insert,int estimatedDistance){ + string filter = "PASS"; + double linkratio= (double)linksToB/(double)LinksFromWindow; + if(RATIO < 0.4){ + filter="BelowExpectedLinks"; + }else if(linkratio < 0.4){ + filter="FewLinks"; + }else if(estimatedDistance > 2*std_insert + mean_insert or estimatedDistance < -2*std_insert+mean_insert){ + filter="UnexpectedDistance"; + } + + return(filter); +} Window::Window(int max_insert, uint16_t minimum_mapping_quality, bool outtie, float mean_insert, float std_insert, int minimumPairs, @@ -30,13 +78,21 @@ Window::Window(int max_insert, uint16_t minimum_mapping_quality, this->outputFileHeader = outputFileHeader; string inter_chr_events = outputFileHeader + "_inter_chr_events.tab"; this->interChrVariations.open(inter_chr_events.c_str()); - this->interChrVariations << "chrA\tstartOnA\tendOnA\tchrB\tstartOnB\tendOnB\tLinksFromWindow\tLinksToChrB\tLinksToEvent\tCoverageOnChrA\tCoverageOnChrB\tOrientationA\tOrientationB\n"; + this->interChrVariations <<"chrA\tstartOnA\tendOnA\tchrB\tstartOnB\tendOnB\tLinksFromWindow\tLinksToChrB\tLinksToEvent\tCoverageOnChrA\tCoverageOnChrB\tOrientationA\tOrientationB\tExpectedLinksToEvent\tLinksToEvent/ExpectedLinksToEvent\tEstimatedDistance\n"; string intra_chr_events = outputFileHeader + "_intra_chr_events.tab"; this->intraChrVariations.open(intra_chr_events.c_str()); - this->intraChrVariations << "chrA\tstartOnA\tendOnA\tchrB\tstartOnB\tendOnB\tLinksFromWindow\tLinksToChrB\tLinksToEvent\tCoverageOnChrA\tCoverageOnChrB\tOrientationA\tOrientationB\n"; + this->intraChrVariations << "chrA\tstartOnA\tendOnA\tchrB\tstartOnB\tendOnB\tLinksFromWindow\tLinksToChrB\tLinksToEvent\tCoverageOnChrA\tCoverageOnChrB\tOrientationA\tOrientationB\tExpectedLinksToEvent\tLinksToEvent/ExpectedLinksToEvent\tEstimatedDistance\n"; + + string inter_chr_eventsVCF = outputFileHeader + "_inter_chr_events.vcf"; + this->interChrVariationsVCF.open(inter_chr_eventsVCF.c_str()); + this->interChrVariationsVCF << VCFHeader(); + + string intra_chr_eventsVCF = outputFileHeader + "_intra_chr_events.vcf"; + this->intraChrVariationsVCF.open(intra_chr_eventsVCF.c_str()); + this->intraChrVariationsVCF << VCFHeader(); this->coverage = 0; - this->chr =-1; + this->chr =-1; } @@ -146,27 +202,42 @@ float Window::computeCoverageB(int chrB, int start, int end, int32_t secondWindo } //This function accepts a queue with alignments that links to chrB from chrA, the function returns the number of reads that are positioned inside the region on A and have mates that are linking anywhere on B -int findLinksToChr2(queue ReadQueue,long startChrA,long stopChrA){ +vector findLinksToChr2(queue ReadQueue,long startChrA,long stopChrA,long startChrB,long endChrB, int pairsFormingLink){ int linkNumber=0; int QueueSize=ReadQueue.size(); + int estimatedDistance=0; + int lengthWindowA=stopChrA-startChrA+1; + for(int i=0;i= ReadQueue.front().Position ){ linkNumber+=1; + //If the reads are bridging the two windows, calculate the mate and read distance + if(startChrB <= ReadQueue.front().MatePosition and endChrB >= ReadQueue.front().MatePosition){ + estimatedDistance+=lengthWindowA-(ReadQueue.front().Position-startChrA)+(ReadQueue.front().MatePosition-startChrB); + } } ReadQueue.pop(); } - - return(linkNumber); + vector output; + output.push_back(linkNumber); + if(pairsFormingLink > 0){ + output.push_back(estimatedDistance/pairsFormingLink); + }else{ + //if no links exists, the region will not count as an event, thus any number may be returned as the distance(the statistics wont be printed anyway) + output.push_back(-1); + } + return(output); } - vector Window::computeStatisticsA(string bamFileName, int chrB, int start, int end, int32_t WindowLength, string indexFile){ vector statisticsVector; int bases=0; double coverage; + double AverageReadLength=0; BamReader bamFile; BamAlignment currentRead; int linksFromWindow=0; + int nreads=0; if(!bamFile.Open(bamFileName)){ statisticsVector.push_back(-1);statisticsVector.push_back(-1); @@ -185,6 +256,8 @@ vector Window::computeStatisticsA(string bamFileName, int chrB, int star if(alignmentStatus != unmapped and alignmentStatus != lowQualty ) { //calculates the length of a read bases+=currentRead.Length; + AverageReadLength+=currentRead.Length; + nreads++; //if the distance between a pair is too long, the amount of links from the winddoe is increased if(alignmentStatus == pair_wrongChrs or alignmentStatus == pair_wrongDistance) { @@ -195,10 +268,11 @@ vector Window::computeStatisticsA(string bamFileName, int chrB, int star } } } + AverageReadLength=AverageReadLength/nreads; bamFile.Close(); //calculates the coverage and returns the coverage within the window coverage=bases/float(WindowLength+1); - statisticsVector.push_back(coverage);statisticsVector.push_back(linksFromWindow); + statisticsVector.push_back(coverage);statisticsVector.push_back(linksFromWindow);statisticsVector.push_back(AverageReadLength); return(statisticsVector); } @@ -219,6 +293,8 @@ bool Window::computeVariations(int chr2) { long stopSecondWindow=chr2regions[i*3+1]; long pairsFormingLink=chr2regions[i*3+2]; + + //resize region so that it is just large enough to cover the reads that have mates in the present cluster vector chrALimit=newChrALimit(this-> eventReads[chr2],startSecondWindow,stopSecondWindow); @@ -226,8 +302,10 @@ bool Window::computeVariations(int chr2) { long startchrA=chrALimit[0]; long stopchrA=chrALimit[1]; - int numLinksToChr2=findLinksToChr2(eventReads[chr2],startchrA, stopchrA); - + vector statsOnB=findLinksToChr2(eventReads[chr2],startchrA, stopchrA,startSecondWindow,stopSecondWindow,pairsFormingLink); + int numLinksToChr2=statsOnB[0]; + int estimatedDistance=statsOnB[1]; + vector orientation=computeOrientation(eventReads[chr2],startchrA ,stopchrA,startSecondWindow,stopSecondWindow); string read1_orientation= orientation[0]; string read2_orientation= orientation[1]; @@ -236,27 +314,53 @@ bool Window::computeVariations(int chr2) { vector statisticsFirstWindow =computeStatisticsA(bamFileName, eventReads[chr2].front().RefID, startchrA, stopchrA, (stopchrA-startchrA), indexFile); double coverageRealFirstWindow = statisticsFirstWindow[0]; int linksFromWindow=int(statisticsFirstWindow[1]); - double coverageRealSecondWindow=computeCoverageB(chr2, startSecondWindow, stopSecondWindow, (stopSecondWindow-startSecondWindow) ); - - //I need to find the window that maximise this + int averageReadLength=int(statisticsFirstWindow[2]); + double coverageRealSecondWindow=computeCoverageB(chr2, startSecondWindow, stopSecondWindow, (stopSecondWindow-startSecondWindow+1) ); + if(pairsFormingLink >= minimumPairs ) { //ration between coverage // and coverageRealFirstWindow < 5*this->meanCoverage - - + //I need to find the window that maximise this + int secondWindowLength=(stopSecondWindow-startSecondWindow+1); + int firstWindowLength=stopchrA-startchrA+1; + float expectedLinksInWindow = ExpectedLinks(firstWindowLength, secondWindowLength, estimatedDistance, mean_insert, std_insert, coverageRealFirstWindow, averageReadLength); if(this->chr == chr2) { intraChrVariations << position2contig[this->chr] << "\t" << startchrA << "\t" << stopchrA << "\t" ; intraChrVariations << position2contig[chr2] << "\t" << startSecondWindow << "\t" << stopSecondWindow << "\t" ; intraChrVariations << linksFromWindow << "\t" << numLinksToChr2 << "\t" << pairsFormingLink << "\t"; intraChrVariations << coverageRealFirstWindow << "\t" << coverageRealSecondWindow << "\t"; - intraChrVariations << read1_orientation << "\t" << read2_orientation << "\n" ; - //expectedLinksInWindow << "\t" << pairsFormingLink/expectedLinksInWindow << "\t" << estimatedDistance << "\n"; + intraChrVariations << read1_orientation << "\t" << read2_orientation << "\t" + << expectedLinksInWindow << "\t" << pairsFormingLink/expectedLinksInWindow << "\t" << estimatedDistance << "\n"; + + string filter=filterFunction(pairsFormingLink/expectedLinksInWindow, pairsFormingLink, linksFromWindow,mean_insert, std_insert,estimatedDistance); + string svType="BND"; + + intraChrVariationsVCF << position2contig[this -> chr] << "\t" << stopchrA << "\t.\t" ; + intraChrVariationsVCF << "N" << "\t" << "N[" << position2contig[chr2] << ":" << startSecondWindow << "["; + intraChrVariationsVCF << "\t.\t" << filter << "\tSVTYPE="+svType <<";CHRA="<chr]<<";WINA=" << startchrA << "," << stopchrA; + intraChrVariationsVCF <<";CHRB="<< position2contig[chr2] <<";WINB=" << startSecondWindow << "," << stopSecondWindow << ";LFW=" << linksFromWindow; + intraChrVariationsVCF << ";LCB=" << numLinksToChr2 << ";LTE=" << pairsFormingLink << ";COVA=" << coverageRealFirstWindow; + intraChrVariationsVCF << ";COVB=" << coverageRealSecondWindow << ";OA=" << read1_orientation << ";OB=" << read2_orientation; + intraChrVariationsVCF << ";EL=" << expectedLinksInWindow << ";RATIO="<< pairsFormingLink/expectedLinksInWindow <<";ED=" << estimatedDistance << "\n"; + } else { interChrVariations << position2contig[this -> chr] << "\t" << startchrA << "\t" << stopchrA << "\t" ; interChrVariations << position2contig[chr2] << "\t" << startSecondWindow << "\t" << stopSecondWindow << "\t" ; interChrVariations << linksFromWindow << "\t" << numLinksToChr2 << "\t" << pairsFormingLink << "\t"; interChrVariations << coverageRealFirstWindow << "\t" << coverageRealSecondWindow << "\t"; - interChrVariations << read1_orientation << "\t" << read2_orientation << "\n" ; - //expectedLinksInWindow << "\t" << pairsFormingLink/expectedLinksInWindow << "\t" << estimatedDistance << "\n"; + interChrVariations << read1_orientation << "\t" << read2_orientation << "\t" + << expectedLinksInWindow << "\t" << pairsFormingLink/expectedLinksInWindow << "\t" << estimatedDistance << "\n"; + + string filter=filterFunction(pairsFormingLink/expectedLinksInWindow, pairsFormingLink, linksFromWindow,mean_insert, std_insert,estimatedDistance); + string svType="BND"; + + interChrVariationsVCF << position2contig[this -> chr] << "\t" << stopchrA << "\t.\t" ; + interChrVariationsVCF << "N" << "\t" << "N[" << position2contig[chr2] << ":" << startSecondWindow << "["; + interChrVariationsVCF << "\t.\t" << filter << "\tSVTYPE="+svType <<";CHRA="<chr]<<";WINA=" << startchrA << "," << stopchrA; + interChrVariationsVCF <<";CHRB="<< position2contig[chr2] <<";WINB=" << startSecondWindow << "," << stopSecondWindow << ";LFW=" << linksFromWindow; + interChrVariationsVCF << ";LCB=" << numLinksToChr2 << ";LTE=" << pairsFormingLink << ";COVA=" << coverageRealFirstWindow; + interChrVariationsVCF << ";COVB=" << coverageRealSecondWindow << ";OA=" << read1_orientation << ";OB=" << read2_orientation; + interChrVariationsVCF << ";EL=" << expectedLinksInWindow << ";RATIO="<< pairsFormingLink/expectedLinksInWindow <<";ED=" << estimatedDistance << "\n"; + } found = true; @@ -432,3 +536,4 @@ vector Window::computeOrientation(queue alignmentQueue,lon + diff --git a/src/data_structures/Translocation.h b/src/data_structures/Translocation.h index a1c9609..014ec40 100644 --- a/src/data_structures/Translocation.h +++ b/src/data_structures/Translocation.h @@ -48,6 +48,8 @@ class Window { string outputFileHeader; ofstream interChrVariations; ofstream intraChrVariations; + ofstream interChrVariationsVCF; + ofstream intraChrVariationsVCF; Window(int max_insert, uint16_t minimum_mapping_quality, diff --git a/src/data_structures/findTranslocationsOnTheFly.cpp b/src/data_structures/findTranslocationsOnTheFly.cpp index 97b0ff9..660a674 100644 --- a/src/data_structures/findTranslocationsOnTheFly.cpp +++ b/src/data_structures/findTranslocationsOnTheFly.cpp @@ -48,6 +48,8 @@ void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, int32_ } window->interChrVariations.close(); window->intraChrVariations.close(); + window->interChrVariationsVCF.close(); + window->intraChrVariationsVCF.close(); }