From 9a439a4d10634c39864040a0cbd7842d8acd3c38 Mon Sep 17 00:00:00 2001 From: "Molik, David" Date: Tue, 8 Dec 2015 14:31:19 -0500 Subject: [PATCH] extra files that didn't need to be there --- src/Constants.h | 0 src/GeneFeatures.cpp.bak | 1092 -------------------------------------- src/GeneFeatures.h | 0 src/IntervalTree.h | 0 src/parseBAM.cpp.bak | 905 ------------------------------- src/parseBAM.cpp.bak2 | 857 ------------------------------ 6 files changed, 2854 deletions(-) delete mode 100644 src/Constants.h delete mode 100644 src/GeneFeatures.cpp.bak delete mode 100644 src/GeneFeatures.h delete mode 100644 src/IntervalTree.h delete mode 100644 src/parseBAM.cpp.bak delete mode 100644 src/parseBAM.cpp.bak2 diff --git a/src/Constants.h b/src/Constants.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/GeneFeatures.cpp.bak b/src/GeneFeatures.cpp.bak deleted file mode 100644 index 64276db..0000000 --- a/src/GeneFeatures.cpp.bak +++ /dev/null @@ -1,1092 +0,0 @@ -// -// GeneFeatures.cpp -// BAMQC-0.5 -// -// Created by Ying Jin on 9/15/15. -// Copyright (c) 2015 Ying Jin. All rights reserved. -// - -#include "GeneFeatures.h" - -#include -#include -#include -//#include -#include "stdlib.h" -#include -#include -#include -#include - - -//#include - -template > -struct sort_pair_second { - bool operator()(const std::pair&left, const std::pair&right) { - Pred p; - return p(left.first, right.first); - } -}; - - -bool itv_comp(Interval first, Interval second){ - return first.start < second.start ; -} - -int pivot(std::vector &intervals, int first, int last) -{ - int p = first; - int pivotElement = intervals[first].start; - - - for(int i = first+1 ; i <= last ; i++) - { - /* If you want to sort the list in the other order, change "<=" to ">" */ - if(intervals[i].start <= pivotElement) - { - std::swap(intervals[i],intervals[p]); - p++; - - } - } - - return p; -} - -void quick_sort(std::vector &intervals, int first, int last){ - - int pivotElement; - - if(first < last) - { - pivotElement = pivot(intervals, first, last); - quick_sort(intervals, first, pivotElement-1); - quick_sort(intervals, pivotElement+1, last); - } - -} - - -bool reverse_ord_func (int i,int j) { return (j::max(); - max_stop = std::numeric_limits::min(); - gene_actual_len = 0; - stop_codon_st = -1; - stop_codon_end = -1; -} - -Gene::~Gene(){} - -void Gene::add_cds(int st, int end){ - std::pair cds_interval (st,end); - cds.push_back(cds_interval); -} - -void Gene::add_exons(int st, int end){ - if (this->min_start > st) { - this->min_start = st; - } - if (this->max_stopmax_stop = end; - } - std::pair exon_interval (st,end); - gene_actual_len += (end - st+1); - exons.push_back(exon_interval); -} -void Gene::set_stop_codon(int st,int end) -{ - stop_codon_st = st; - stop_codon_end = end; -} - - -void Gene::get_others(){ - std::vector > left_cds; - std::vector > left_exons; - //int idx[exons.size()]; - size_t i; //,j; - int itgUp1k_st, itgUp1k_end,itgDn1k_st,itgDn1k_end ; - - sort(exons.begin(),exons.end()); - sort(cds.begin(),cds.end()); - - - /* for (i=0;i < cds.size(); i++) { - for (j=0; j < exons.size(); j++) { - if (cds[i].first == exons[j].first && cds[i].second == exons[j].second) { - idx[j] = 1; - break; - } - } - if (j == exons.size()) { - left_cds.push_back(cds[i]); - } - } - - for (i=0; i cds[0].first ) { utr5.push_back(std::make_pair(exons[i].first,cds[0].first - 1)); } - - if (exons[i].first <= stop_codon_st && exons[i].second > stop_codon_end ) - { utr3.push_back(std::make_pair(stop_codon_end + 1,exons[i].second)); } - if (exons[i].first > stop_codon_end ) { utr3.push_back(exons[i]) ; } - - } - } - - } - else { - itgDn1k_st = std::max(int(0),exons[0].first - 1000); - itgDn1k_end = std::max(int(0),exons[0].first -1); - itgUp1k_st = exons[exons.size()-1].second + 1 ; - itgUp1k_end = exons[exons.size()-1].second + 1000 ; - itg1k.push_back(std::make_pair(itgUp1k_st,itgUp1k_end)); - itg1k.push_back(std::make_pair(itgDn1k_st,itgDn1k_end)); - - if (stop_codon_st == -1 ) { utr3 = exons; } - else { - //if (left_cds.size() == 1) { - for( i=0;i < exons.size(); i++) { - - if (exons[i].second < stop_codon_st) { utr3.push_back(exons[i]); } - - if (exons[i].first < stop_codon_st && exons[i].second >= stop_codon_end ) - { utr3.push_back(std::make_pair(exons[i].first,stop_codon_st - 1)); } - - if (exons[i].first <= cds[cds.size()-1].first && exons[i].second > cds[cds.size()-1].second ) - { utr5.push_back(std::make_pair(cds[cds.size()-1].second + 1,exons[i].second)); } - - if (exons[i].first > cds[cds.size()-1].second ) { utr5.push_back(exons[i]) ; } - } - } - - } - -} - - - -GeneFeatures::GeneFeatures(std::string GTFfilename,std::string id_attribute) -{ - this->total_exon = 0; - read_features(GTFfilename,id_attribute); - -} - -GeneFeatures::~GeneFeatures(){ - chrom_itvTree_Dict_itr it; - - for (it=cds_exon_idx_plus.begin(); it != cds_exon_idx_plus.end(); it++) { - std::map tmp = it->second; - std::map::iterator tmp_itr ; - //std::cout << tmp.size() << std::endl; - for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr ++) { - delete tmp_itr->second; - } - - } - for (it=cds_exon_idx_minus.begin(); it != cds_exon_idx_minus.end(); it++) { - std::map tmp = it->second; - std::map::iterator tmp_itr ; - - for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr ++) { - delete tmp_itr->second; - } - - } - -} - -void GeneFeatures::build_tree(std::map > temp_plus, std::map > temp_minus) -{ - std::vector itemlist; - std::vector sublist; - std::map >::iterator it; - std::map tmp; - std::map::iterator tmp_itr; - gene_exon_Dict_It exon_str_itr; - - std::string chr,gid;//,start_ss,end_ss; - - int g_idx =-1; - int e_idx =0; - size_t i ; - int cur_bin_id,start_bin_id, js, je, k;//, buket_size; - - for (it = temp_plus.begin(); it != temp_plus.end(); it++) { - chr = it->first; - tmp = it->second; - itemlist.clear(); - - for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr++) { - gid = tmp_itr->first; - features.push_back(gid); //save gene name - g_idx +=1; - Gene tmp_gene = tmp_itr->second; - tmp_gene.get_others(); - - //int gene_len = (int) tmp_gene.max_stop - tmp_gene.min_start + 1; - //std::cout << gene_len << std::endl; - - std::vector gene_base_pos ; - - for (i=0; i< tmp_gene.exons.size(); i++) { - int st = (int) tmp_gene.exons[i].first - tmp_gene.min_start; - int end = (int) tmp_gene.exons[i].second - tmp_gene.min_start; - for (int j=st; j<=end; j++) { - gene_base_pos.push_back(j); - } - } - gene_lengths.push_back(tmp_gene.gene_actual_len); - gene_starts.push_back(tmp_gene.min_start); - gene_ends.push_back(tmp_gene.max_stop); - - std::vector percentile_list; - std::sort(gene_base_pos.begin(),gene_base_pos.end()); - - for (int j=0; j<=100;j++) { - float kk = (tmp_gene.gene_actual_len - 1) * j/100.0; - float f = floor(kk); - float c = ceil(kk); - if (f == c){ - percentile_list.push_back( gene_base_pos[kk]); - } - else{ - float d0 = gene_base_pos[int(f)] * (c-kk); - float d1 = gene_base_pos[int(c)] * (kk-f); - percentile_list.push_back(int(round(d0+d1))); - } - } - gene_percentile_list.push_back(percentile_list); - - for (i=0; i < tmp_gene.cds.size(); i++) { - this->total_exon ++ ; - //std::cout<< this->total_exon << std::endl; - itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.cds[i].first,tmp_gene.cds[i].second,CDS)); - e_idx ++; - } - for (i=0; i < tmp_gene.utr5.size(); i++) { - this->total_exon ++ ; - itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr5[i].first,tmp_gene.utr5[i].second,UTR5)); - e_idx ++; - } - - for (i=0; i < tmp_gene.utr3.size(); i++) { - this->total_exon ++ ; - std::cout << chr << "\t" << tmp_gene.utr3[i].first << "\t" << tmp_gene.utr3[i].second << "\t" << gid << std::endl; - itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr3[i].first,tmp_gene.utr3[i].second,UTR3)); - e_idx ++; - } - for (i=0; i < tmp_gene.intron.size(); i++) { - itemlist.push_back(Interval(g_idx,-1,tmp_gene.intron[i].first,tmp_gene.intron[i].second,INTRON)); - } - itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[0].first,tmp_gene.itg1k[0].second,ITGUP1K)); - itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[1].first,tmp_gene.itg1k[1].second,ITGDN1K)) ; - - } - - std::sort(itemlist.begin(),itemlist.end(),itv_comp) ; //key=operator.attrgetter('start')); - //quick_sort(itemlist,0,itemlist.size()-1); - start_bin_id = itemlist[0].start/BIN_SIZE; - js = 0 ; - je = 0 ; - k = 0 ; - - - for (i=0; i < itemlist.size(); i++) { - cur_bin_id = itemlist[i].start/BIN_SIZE; - //std::cout << cur_bin_id << std::endl; - - if (cur_bin_id == start_bin_id) { - je += 1; - } - else { - //buket_size = (int)sqrt(je - js) + 1; - //std::cout << buket_size << std::endl; - - sublist = std::vector(itemlist.begin()+js,itemlist.begin() + je); - - cds_exon_idx_plus[chr][start_bin_id] = new IntervalTree(sublist); - // std::cout << start_bin_id << " built one tree." << std::endl; - k ++; - start_bin_id = cur_bin_id ; - js = je ; - je ++; - } - } - - - if (js != je) { - //buket_size = (int) sqrt(je - js) + 1; - sublist = std::vector(itemlist.begin()+js,itemlist.begin() + je); - cds_exon_idx_plus[chr][start_bin_id] = new IntervalTree(sublist); - //print("tree depth = " + str(cds_exon_idx_plus[chr][start_bin_id].get_depth())+ "\n") - k+=1; - } - - - } - //std::cout << " build minus strand." << std::endl; - for (it = temp_minus.begin(); it != temp_minus.end(); it++) { - //std::cout << "negative strand " << std::endl; - chr = it->first; - tmp = it->second; - itemlist.clear(); - - for (tmp_itr = tmp.begin(); tmp_itr != tmp.end(); tmp_itr++) { - gid = tmp_itr->first; - //std::cout << "GID " << gid << std::endl; - Gene tmp_gene = tmp_itr->second; - tmp_gene.get_others(); - - features.push_back(gid); //save gene name - g_idx +=1; - - //std::string bs_string = ""; - //int gene_len = (int) tmp_gene.max_stop - tmp_gene.min_start + 1; - - //std::vector > bs_list; - std::vector gene_base_pos ; - - /*for (int j=0; j< gene_len; j+=100) { - std::bitset<100> bs; - bs_list.push_back(bs); - }*/ - - for (i=0; i< tmp_gene.exons.size(); i++) { - int st = (int) tmp_gene.exons[i].first - tmp_gene.min_start; - int end = (int) tmp_gene.exons[i].second - tmp_gene.min_start; - //int size = end - st; - - for (int j=st; j<=end; j++) { - gene_base_pos.push_back(j); - } - - } - gene_lengths.push_back(tmp_gene.gene_actual_len); - gene_starts.push_back(tmp_gene.min_start); - gene_ends.push_back(tmp_gene.max_stop); - - std::vector percentile_list; - std::sort(gene_base_pos.begin(),gene_base_pos.end(),reverse_ord_func); - - for (int j=0; j<=100;j++) { - float kk = (tmp_gene.gene_actual_len - 1) * j/100.0; - float f = floor(kk); - float c = ceil(kk); - if (f == c){ - percentile_list.push_back( gene_base_pos[kk]); - } - else{ - float d0 = gene_base_pos[int(f)] * (c-kk); - float d1 = gene_base_pos[int(c)] * (kk-f); - percentile_list.push_back(int(round(d0+d1))); - } - } - gene_percentile_list.push_back(percentile_list); - - - for (i=0; i < tmp_gene.cds.size(); i++) { - total_exon += 1; - itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.cds[i].first,tmp_gene.cds[i].second,CDS)); - e_idx ++; - } - for (i=0; i < tmp_gene.utr5.size(); i++) { - total_exon += 1; - itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr5[i].first,tmp_gene.utr5[i].second,UTR5)); - e_idx ++; - } - for (i=0; i < tmp_gene.utr3.size(); i++) { - total_exon += 1; - itemlist.push_back(Interval(g_idx,e_idx,tmp_gene.utr3[i].first,tmp_gene.utr3[i].second,UTR3)); - e_idx ++; - } - for (i=0; i < tmp_gene.intron.size(); i++) { - itemlist.push_back(Interval(g_idx,-1,tmp_gene.intron[i].first,tmp_gene.intron[i].second,INTRON)); - } - itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[0].first,tmp_gene.itg1k[0].second,ITGUP1K)); - itemlist.push_back(Interval(g_idx,-1,tmp_gene.itg1k[1].first,tmp_gene.itg1k[1].second,ITGDN1K)) ; - - } - - //std::sort(key=operator.attrgetter('start')); - //quick_sort(itemlist,0,itemlist.size()-1); - std::sort(itemlist.begin(),itemlist.end(),itv_comp) ; - start_bin_id = itemlist[0].start/BIN_SIZE; - js = 0 ; - je = 0 ; - k = 0 ; - - for (i=0; i < itemlist.size(); i++) { - cur_bin_id = itemlist[i].start/BIN_SIZE; - if (cur_bin_id == start_bin_id) { - je += 1; - } - else { - //buket_size = (int)sqrt(je - js) + 1; - // std::cout << buket_size << std::endl; - - - sublist = std::vector(itemlist.begin()+js,itemlist.begin() + je); - //cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist,16,buket_size,-1,-1,buket_size); - cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist); - k ++; - start_bin_id = cur_bin_id ; - js = je ; - je ++; - } - } - - - if (js != je) { - //buket_size = (int) sqrt(je - js) + 1; - //std::cout << buket_size << std::endl; - - sublist = std::vector(itemlist.begin()+js,itemlist.begin() + je); - //cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist, 16, buket_size,-1,-1,buket_size ); - cds_exon_idx_minus[chr][start_bin_id] = new IntervalTree(sublist); - //print("tree depth = " + str(cds_exon_idx_plus[chr][start_bin_id].get_depth())+ "\n") - k+=1; - } - - - } - -} -//Reading & processing annotation files -void GeneFeatures::read_features(std::string gff_filename, std::string id_attribute) -{ - - //dict of dicts since the builtin type doesn't support it for some reason - std::map > temp_plus ; - std::map > temp_minus ; - std::map >::iterator tmp_itr; - std::map::iterator id_itr; - - bool matched = false; - //int k = 0; - int i = 0; - int counts = 0 ; - int line_no = 0; - int start = -1; - int end = -1; - std::size_t pos,cur_pos; - //std::string left_str,sub_str; - - std::ifstream input; //(gff_filename); - - try{ - input.open (gff_filename, std::ifstream::in); - - while(! input.eof()){ - - std::string line,chrom,source,feature,start_ss,end_ss,score,strand,frame,attributeStr; - std::stringstream ss; - std::string id = ""; - - if (! std::getline(input,line)){ - break; - } - - line_no ++; - - if (line == "\n" || !line.compare(0,1,"#")) { - continue; - } - - ss << line; - std::getline(ss,chrom,'\t'); - - std::getline(ss,source,'\t'); - std::getline(ss,feature,'\t'); - std::getline(ss,start_ss,'\t'); - std::getline(ss,end_ss,'\t'); - std::getline(ss,score,'\t'); - std::getline(ss,strand,'\t'); - std::getline(ss,frame,'\t'); - std::getline(ss,attributeStr,'\t'); - - //std::cout << strand << std::endl; - try{ - start = std::stol(start_ss); - end = std::stol(end_ss); - } - catch (const std::invalid_argument& ia) { - std::cerr << "Invalid argument: " << ia.what() << '\n'; - std::exit(1); - - } - - cur_pos = 0; - //std::cout <second); - - (*g).set_stop_codon(start,end); - } - else{ - Gene g (id,strand); - g.set_stop_codon(start,end); - temp_plus[chrom].insert(std::pair(id,g)); - } - } - else{ - Gene g(id,strand); - g.set_stop_codon(start,end); - std::map gene_id_map ; - gene_id_map.insert(std::pair (id,g)); - temp_plus.insert(std::pair > (chrom,gene_id_map)); - - } - - } - - if (strand == "-" ) { - tmp_itr = temp_minus.find(chrom); - if (tmp_itr != temp_minus.end()) { - id_itr = temp_minus[chrom].find(id); - if (id_itr != temp_minus[chrom].end()) { - Gene *g = &(id_itr->second); - (*g).set_stop_codon(start,end); - } - else{ - Gene g(id,strand); - g.set_stop_codon(start,end); - std::map gene_id_map ; - temp_minus[chrom].insert(std::pair(id,g)); - - } - } - else{ - Gene g(id,strand); - g.set_stop_codon(start,end); - std::map gene_id_map ; - gene_id_map.insert(std::pair (id,g)); - - temp_minus.insert(std::pair > (chrom,gene_id_map)); - - - } - - } - - } - if (feature == "CDS" ){ - if (strand == "+" ){ - tmp_itr = temp_plus.find(chrom); - if (tmp_itr != temp_plus.end()) { - id_itr = temp_plus[chrom].find(id); - if (id_itr != temp_plus[chrom].end()) { - Gene *g = &(id_itr->second); - //(id_itr->second).add_cds(start,end); - (*g).add_cds(start,end); - } - else{ - Gene g (id,strand); - g.add_cds(start,end); - temp_plus[chrom].insert(std::pair(id,g)); - } - } - else{ - Gene g(id,strand); - g.add_cds(start,end); - std::map gene_id_map ; - gene_id_map.insert(std::pair (id,g)); - temp_plus.insert(std::pair > (chrom,gene_id_map)); - - } - - } - - if (strand == "-" ) { - - tmp_itr = temp_minus.find(chrom); - if (tmp_itr != temp_minus.end()) { - id_itr = temp_minus[chrom].find(id); - if (id_itr != temp_minus[chrom].end()) { - Gene *g = &(id_itr->second); - (*g).add_cds(start,end); - } - else{ - Gene g(id,strand); - g.add_cds(start,end); - std::map gene_id_map ; - temp_minus[chrom].insert(std::pair(id,g)); - - } - } - else{ - Gene g(id,strand); - g.add_cds(start,end); - std::map gene_id_map ; - gene_id_map.insert(std::pair (id,g)); - - temp_minus.insert(std::pair > (chrom,gene_id_map)); - - - } - - } - } - - if (feature == "exon" ){ - counts += 1 ; - if (strand == "+" ){ - tmp_itr = temp_plus.find(chrom); - if (tmp_itr != temp_plus.end()) { - id_itr = temp_plus[chrom].find(id); - if (id_itr != temp_plus[chrom].end()) { - Gene *g = &(id_itr->second); - (*g).add_exons(start,end); - } - else{ - Gene g (id,strand); - g.add_exons(start,end); - - temp_plus[chrom].insert(std::pair(id,g)); - } - } - else{ - Gene g (id,strand); - g.add_exons(start,end); - std::map gene_id_map ; - - gene_id_map.insert(std::pair (id,g)); - temp_plus.insert(std::pair > (chrom,gene_id_map)); - - } - - } - - if (strand == "-" ) { - - tmp_itr = temp_minus.find(chrom); - if (tmp_itr != temp_minus.end()) { - - id_itr = temp_minus[chrom].find(id); - if (id_itr != temp_minus[chrom].end()) { - Gene *g = &(id_itr->second); - (*g).add_exons(start,end); - } - else{ - Gene g(id,strand); - g.add_exons(start,end); - - temp_minus[chrom].insert(std::pair(id,g)); - - } - } - else{ - Gene g (id,strand); - g.add_exons(start,end); - std::map gene_id_map ; - gene_id_map.insert(std::pair (id,g)); - - temp_minus.insert(std::pair > (chrom,gene_id_map)); - - } - } - - } - - i += 1 ; - //if (i % 100000 == 0 ) - //{ - //sys.stderr.write("%d GTF lines processed.\n" % i); - //std::cout << i << " GTF lines processed." << std::endl; - //} - - } - - - input.close(); - - } - catch(std::ifstream::failure e){ - std::cout << "error in read file " << gff_filename << std::endl; - } - - if (counts == 0 ){ - std::cout << "Warning: No features of type 'exon' or 'CDS' found in gene GTF file." << std::endl; - } - - - build_tree(temp_plus,temp_minus); - - -} - -//find exons of given gene that overlap with the given intervals -//return list of tuples -std::vector > GeneFeatures::get_exons(std::string chrom,int st,int end,std::string strand) -{ - std::vector > exons; - chrom_itvTree_Dict::iterator chrom_it; - std::map::iterator bin_iter; - std::vector fs ; - std::vector temp ; - size_t i; - int bin_id ; - - //try: - bin_id = st/BIN_SIZE ; - - if (strand == "+" || strand == "."){ - chrom_it = cds_exon_idx_plus.find(chrom); - if (chrom_it != cds_exon_idx_plus.end()) { - bin_iter = cds_exon_idx_plus[chrom].find(bin_id); - if (bin_iter != cds_exon_idx_plus[chrom].end()) { - fs = (*cds_exon_idx_plus[chrom][bin_id]).find(st,end); - } - } - } - - if (strand == "-" || strand == "."){ - chrom_it = cds_exon_idx_minus.find(chrom); - if (chrom_it != cds_exon_idx_minus.end()) { - bin_iter = cds_exon_idx_minus[chrom].find(bin_id); - if (bin_iter != cds_exon_idx_minus[chrom].end()) { - temp = (*cds_exon_idx_minus[chrom][bin_id]).find(st,end); - fs.insert(fs.end(),temp.begin(),temp.end()); - } - } - } - - for(i =0 ; i < fs.size(); i++){ - if (fs[i].type != INTRON && fs[i].type != ITGUP1K && fs[i].type != ITGDN1K){ - int s = fs[i].start; - int e = fs[i].stop; - - if (s < st) { - s = st; - } - if (e > end) { - e = end; - } - exons.push_back(std::make_pair(s,e)); - } - } - - //std::sort(exons.begin(),exons.end()); - return exons; -} - -std::vector GeneFeatures::getFeatures() { - return features ; -} - -int GeneFeatures::get_start(int g) -{ - if ((size_t) g < gene_starts.size() && g >=0) { - return gene_starts[g]; - } - else{ - return -1; - } -} - -int GeneFeatures::get_stop(int g) -{ - if ((size_t)g < gene_ends.size() && g >=0) { - return gene_ends[g]; - } - else{ - return -1; - } -} - -std::string GeneFeatures::get_name(int g) -{ - if ((size_t)g < features.size() && g >=0) { - return features[g]; - } - else{ - return ""; - } - -} -int GeneFeatures::get_numofgenes(){ - return features.size(); -} - -int GeneFeatures::exist_in_percentile_list(int gene,int pos){ - int idx = (int)pos - gene_starts[gene]; - for (int i=0; i<=100; i++) { - - if (idx == gene_percentile_list[gene][i]) { - return i; - } - - } - return -1; -} - - - -std::map GeneFeatures::Gene_annotation(std::string chrom, std::vector > itv_list,std::string strand,std::vector * mapped_exons ) -{ - //std::vector > genes; - std::vector fs ; - // std::vector tmp ; - chrom_itvTree_Dict::iterator chrom_it; - std::map::iterator bin_iter; - - int type=-1 ; - size_t i; - int bin_id_s, bin_id_e; - ///bool isCDS = false; - //bool isINTRON = false; - //bool isUTR3 = false; - //bool isUTR5 = false; - //bool isITGup = false; - //bool isITGdn = false; - - for (i=0; i < itv_list.size(); i ++) { - bin_id_s = itv_list[i].first/BIN_SIZE; - bin_id_e = itv_list[i].second/BIN_SIZE; - - if (strand == "+" || strand == ".") { - chrom_it = cds_exon_idx_plus.find(chrom); - if (chrom_it != cds_exon_idx_plus.end()) { - - bin_iter = cds_exon_idx_plus[chrom].find(bin_id_s); - if (bin_iter != cds_exon_idx_plus[chrom].end() ) - { - fs = (*cds_exon_idx_plus[chrom][bin_id_s]).find(itv_list[i].first,itv_list[i].second); - if (bin_id_s != bin_id_e) { - bin_iter = cds_exon_idx_plus[chrom].find(bin_id_e); - if (bin_iter != cds_exon_idx_plus[chrom].end() ){ - std::vector tmp = (*cds_exon_idx_plus[chrom][bin_id_e]).find(itv_list[i].first,itv_list[i].second); - fs.insert(fs.end(),tmp.begin(),tmp.end()); - } - } - } - } - } - - if (strand == "-" or strand == ".") { - chrom_it = cds_exon_idx_minus.find(chrom); - if (chrom_it != cds_exon_idx_minus.end()) { - bin_iter = cds_exon_idx_minus[chrom].find(bin_id_s); - if (bin_iter != cds_exon_idx_minus[chrom].end() ) - { - std::vector tmp = (*cds_exon_idx_minus[chrom][bin_id_s]).find(itv_list[i].first,itv_list[i].second); - fs.insert(fs.end(),tmp.begin(),tmp.end()); - - if (bin_id_s != bin_id_e) { - bin_iter = cds_exon_idx_minus[chrom].find(bin_id_e); - if (bin_iter != cds_exon_idx_minus[chrom].end() ){ - std::vector tmp = (*cds_exon_idx_minus[chrom][bin_id_e]).find(itv_list[i].first,itv_list[i].second); - fs.insert(fs.end(),tmp.begin(),tmp.end()); - } - } - } - } - } - } - //std::cout << fs.size() << std::endl; - /* for(i =0 ; i < fs.size(); i++){ - if (fs[i].type == CDS) { - isCDS = true ; - break; - } - if (fs[i].type == UTR5) { - isUTR5 = true; - } - if (fs[i].type == UTR3) { - isUTR3 = true; - } - if (fs[i].type == INTRON) { - isINTRON = true; - } - if (fs[i].type == ITGUP1K) { - isITGup = true; - } - if (fs[i].type == ITGDN1K) { - isITGdn = true; - } - - } - - if (isCDS) { - type = CDS; - } - else{ - if (isUTR5){ - type = UTR5; - } - else { if (isUTR3){ - type = UTR3; - } - else { if (isINTRON) { - type = INTRON; - } - else { if (isITGup) { - type = ITGUP1K; - } - else { if (isITGdn) { - type = ITGDN1K; - } - else { - type = INTERGENIC; - } - }}}}} - */ - - //genes.push_back(type); - std::map gene_type_map; - std::map::iterator gene_type_map_itr; - for (i=0; i < fs.size(); i++) { - //if (fs[i].type == type) { - // genes.push_back(fs[i].gene); - //} - gene_type_map_itr = gene_type_map.find(fs[i].gene); - if(gene_type_map_itr != gene_type_map.end()) - { - if(fs[i].type < gene_type_map[fs[i].gene]) { - gene_type_map[fs[i].gene] = fs[i].type; - } - } - else { - gene_type_map.insert(std::pair (fs[i].gene,fs[i].type)); - } - //genes.push_back(std::pair(fs[i].type,fs[i].gene)); - if (fs[i].exon != -1) { - mapped_exons->push_back(fs[i].exon); - } - } - return gene_type_map; - //return genes; - -} -/* - -int main() { - std::string filename = "test.gtf"; - std::string id_attr = "gene_id"; - - std::vector > itv_list ; - chr_ITV exp ; - exp.chrom = "chr4"; - int start = 1048489; - int end = 1049900; - std::pair p (start,end); - - itv_list.push_back(p); - - std::cout << "start to build tree " << std::endl; - GeneFeatures gIdx (filename,id_attr); - std::cout << "after build tree " << std::endl; - - std::cout << "total exon " << gIdx.total_exon << std::endl; - std::cout << "total gene " << gIdx.features.size() << std::endl; - - // for (int i=0; i < itv_list.size(); i++) { - // std::cout << itv_list[i].start << std::endl; - //} - std::vector *exons = new std::vector(); - std::vector res = gIdx.Gene_annotation("chr4",itv_list,".",exons); - std::vector > exon_list = gIdx.get_exons("chr4",start,end,"."); - for (auto& p : exon_list){ - std::cout << p.first << "\t" << p.second << std::endl; - } - std::vector > gene_percentile_list = gIdx.gene_percentile_list; - - for (int i=0; i perc_list = gene_percentile_list[i]; - for (int j=0; jsize() << std::endl; - - for (int i=0;ioperator[](i) << std::endl; - - } - delete exons; - - - bool test_bool = false; - std::cout << test_bool << std::endl; - -}*/ diff --git a/src/GeneFeatures.h b/src/GeneFeatures.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/IntervalTree.h b/src/IntervalTree.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/parseBAM.cpp.bak b/src/parseBAM.cpp.bak deleted file mode 100644 index 03712fc..0000000 --- a/src/parseBAM.cpp.bak +++ /dev/null @@ -1,905 +0,0 @@ -// -// parseBAM.cpp -// BAMQC_c++ -// -// Created by Ying Jin on 11/11/15. -// Copyright (c) 2015 Ying Jin. All rights reserved. -// - -#include "parseBAM.h" -#include "Coverage_prof.h" -#include "GeneFeatures.h" -#include "InnerDist_prof.h" -#include "Mappability.h" -#include "Results.h" -#include "rRNA.h" -#include "Constants.h" - -//#include -#include -#include -#include -//#include -#include -#include -#include - -#include "htslib/sam.h" - - - -unsigned int tick_time =3000; - -#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED) -#define IS_QCFAIL(bam) ((bam)->core.flag & BAM_FQCFAIL) - -#define IS_PAIRED_AND_MAPPED(bam) (((bam)->core.flag&BAM_FPAIRED) && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) - -#define IS_PROPERLYPAIRED(bam) (((bam)->core.flag&(BAM_FPAIRED|BAM_FPROPER_PAIR)) == (BAM_FPAIRED|BAM_FPROPER_PAIR) && !((bam)->core.flag&BAM_FUNMAP)) - -#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) -#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) -#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) -#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) -#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) -#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) -//#defind READ_NAME(bam) (bam_get_qname(bam)) - -/*DEF BAM_CMATCH = 0 -DEF BAM_CINS = 1 -DEF BAM_CDEL = 2 -DEF BAM_CREF_SKIP = 3 -DEF BAM_CSOFT_CLIP = 4 -DEF BAM_CHARD_CLIP = 5 -DEF BAM_CPAD = 6 -DEF BAM_CEQUAL = 7 -DEF BAM_CDIFF = 8*/ - -typedef struct -{ - unsigned short thread_id; - pthread_t thread_object; - //pthread_spinlock_t cur_reads_lock; - pthread_spinlock_t cur_reads_lock; - - std::vector > *cur_reads; - - Results * res; - InnerDist_prof * inDist_prof; - Clipping_prof * clip_prof; - Coverage_prof * cov_prof; - -} thread_context_t; - -typedef struct -{ - unsigned short thread_number; - int all_finished; - std::string format; - std::string stranded; - //bam_hdr_t * header; - std::vector refnames; - std::vector thread_contexts; - - GeneFeatures * geneIdx; - rRNA * rRNAIdx; - -} global_context_t; - -struct arg_struct { - global_context_t * arg1; - thread_context_t * arg2; -}; -std::vector > fetch_intron(int st, uint32_t * cigar, uint32_t n_cigar,std::string format) -{ - //''' fetch intron regions defined by cigar. st must be zero based return list of tuple of (st, end)''' - //match = re.compile(r'(\d+)(\D)') - int chrom_st = st; - if (format == "BAM") { chrom_st += 1 ;} - - std::vector > intron_bound ; - - for (unsigned int i=0; i < n_cigar; i++){ //code and size - if (bam_cigar_op(cigar[i])==BAM_CMATCH) { chrom_st += bam_cigar_oplen(cigar[i]);} //match - if (bam_cigar_op(cigar[i])==BAM_CINS) {continue;} //insertion - if (bam_cigar_op(cigar[i])==BAM_CDEL) { chrom_st += bam_cigar_oplen(cigar[i]);} //deletion - if (bam_cigar_op(cigar[i])==BAM_CREF_SKIP) { intron_bound.push_back(std::pair (chrom_st,chrom_st+ bam_cigar_oplen(cigar[i]))); } //gap - if (bam_cigar_op(cigar[i])==BAM_CSOFT_CLIP) { chrom_st += bam_cigar_oplen(cigar[i]);}// soft clipping - } - - return intron_bound ; -} - -std::vector > fetch_exon(int st,uint32_t * cigar,uint32_t n_cigar,std::string format) -{ - //''' fetch exon regions defined by cigar. st must be zero based return list of tuple of (st, end)''' - //match = re.compile(r'(\d+)(\D)') - int chrom_st = st; - if (format == "BAM") { chrom_st += 1;} - - std::vector >exon_bound; - - for (unsigned int i=0; i < n_cigar; i++){ //code and size - if (bam_cigar_op(cigar[i])==BAM_CMATCH) { exon_bound.push_back(std::pair (chrom_st,chrom_st + bam_cigar_oplen(cigar[i])));} //match - if (bam_cigar_op(cigar[i])==BAM_CINS) {continue;} //insertion - if (bam_cigar_op(cigar[i])==BAM_CDEL) { chrom_st += bam_cigar_oplen(cigar[i]);} //deletion - if (bam_cigar_op(cigar[i])==BAM_CREF_SKIP) { chrom_st += bam_cigar_oplen(cigar[i]); } //gap - if (bam_cigar_op(cigar[i])==BAM_CSOFT_CLIP) { chrom_st += bam_cigar_oplen(cigar[i]);}// soft clipping - } - return exon_bound; -} - -//return flag and gene id -std::pair ovp_gene(GeneFeatures * geneIdx,std::string chrom1,std::vector >exon_blocks1,std::string chrom2,std::vector > exon_blocks2,std::string strand,std::vector * mapped_exons) -{ - - std::map res1 = geneIdx->Gene_annotation(chrom1,exon_blocks1,strand,mapped_exons); - std::map res2 = geneIdx->Gene_annotation(chrom2,exon_blocks2,strand,mapped_exons); - - if(res2.size() == 0) - { - if(res1.size ==0 ) { return std::pair(-1,-1);} - if(res1.size() == 1) { return std::pair(res1.begin()->second,res1.begin()->first);} - std::map > type_gene_list; - for(auto& kv : res1){ - if(type_gene_list.find(kv.second) != type_gene_list.end()) - { - type_gene_list[kv.second].push_back(kv.first); - } - if(kv.second < type) { type = kv.second;} - } - return std::pair(type,-1); - } - if(res1.size() == 0) - { - if(res2.size ==0 ) { return std::pair(-1,-1);} - if(res2.size() == 1) { return std::pair(res2.begin()->second,res2.begin()->first);} - if(res2.size()==2 && res2.find(-1) != res2.end()) - { - return std::pair(res2.begin()->second,res2.begin()->first); - } - if(res2.find(-1) != res2.end()) { - res2.erase(-1); - } - int type = 6; - for(auto& kv : res1){ - if(kv.second < type) { type = kv.second;} - } - return std::pair(type,-1); - } - //PE - if (res1.size() ==1 && res2.size() ==1 ){ - int g1 = res1.begin()->first; - int type1 = res1.begin()->second; - int g2 = res2.begin()->first; - int type2 = res2.begin()->second; - if(g1 == g2) { - if(type1 < type2) { return std::pair (type1,g1); } - else { return std::pair (type2,g1); } - }else { - if(type1 < type2) { return std::pair (type1,-1); } - else { return std::pair (type2,-1); } - } - } - if (res1.size() > 1 || res2.size() > 1 ) - { - std::vector ovp_genes ; - for(auto& kv : res1 ){ - if(res2.find(kv.first!=res2.end()) { - ovp_genes.push_back(kv.first); - } - } - if(ovp_genes.size() == 1) { - if(res1[ovp_genes[0]] < res2[ovp_genes[0]]) { return std::pair (res1[ovp_genes[0]],ovp_genes[0]); } - else { return std::pair (res2[ovp_genes[0]],ovp_genes[0]); } - } - - - } - - return std::pair (-1,-1); - -} - -void process_aligned_fragment(global_context_t * global_context,thread_context_t * thread_context,std::pair read_pair) -{ - - //std::cout << "start process" << std::endl; - bam1_t * cur_read1 = read_pair.first; - bam1_t * cur_read2 = read_pair.second; - std::string strand1 = "."; - std::string strand2 = "."; - int chrom1_id = -1; - int chrom2_id = -1; - std::vector > exons1 ; - std::vector > exons2 ; - std::vector > intron_blocks1 ; - std::vector > intron_blocks2 ; - std::string qname = ""; - std::string chrom1 = ""; - std::string chrom2 = ""; - - if(cur_read1 != NULL ) { - if(!IS_UNMAPPED(cur_read1)) - { - uint32_t *cigar = bam_get_cigar(cur_read1); - char * name = bam_get_qname(cur_read1); - // printf("read name %s \n", name); - thread_context-> clip_prof->set(cur_read1->core.l_qseq,cur_read1->core.n_cigar,cigar,cur_read1->core.qual); - // printf("after clip prof set %s \n", name); - chrom1_id = cur_read1->core.tid; - //p1 = global_context->header->target_name[cur_read1->core.tid]; - - qname = std::string(name); - //std::cout << "chromID " << chrom1_id << std::endl; - if((size_t)cur_read1->core.tid < global_context->refnames.size()){ - chrom1 = global_context->refnames[cur_read1->core.tid]; - } - else { - std::cout <<"missing reference sequences." << std::endl; - std::exit(1); - } - - //std::cout << chrom1 << std::endl; - strand1 = IS_REVERSE(cur_read1) ? "-" : "+"; - - if (global_context->stranded =="reverse") { - strand1 = (strand1 == "+") ? "-" : "+"; - } - exons1 = fetch_exon(cur_read1->core.pos, cigar,cur_read1->core.n_cigar, global_context->format); - intron_blocks1 = fetch_intron(cur_read1->core.pos,cigar,cur_read1->core.n_cigar,global_context->format); - } - } - if(cur_read2 != NULL){ - if(! IS_UNMAPPED(cur_read2)){ - uint32_t *cigar = bam_get_cigar(cur_read2); - // char * name = bam_get_qname(cur_read2); - // printf("read name %s \n", name); - thread_context->clip_prof->set(cur_read2->core.l_qseq,cur_read2->core.n_cigar,cigar,cur_read2->core.qual); - strand2 = IS_REVERSE(cur_read2) ? "-" : "+"; - - if (global_context-> stranded =="reverse") { - strand2 = ( strand2 == "+") ? "-" : "+"; - } - chrom2_id = cur_read2->core.tid; - if((size_t)cur_read2->core.tid < global_context->refnames.size()){ - chrom2 = global_context->refnames[cur_read2->core.tid]; - } - else { - std::cout <<"missing reference sequences." << std::endl; - std::exit(1); - } - exons2 = fetch_exon(cur_read2->core.pos, cigar,cur_read2->core.n_cigar, global_context->format); - intron_blocks2 = fetch_intron(cur_read2->core.pos,cigar,cur_read2->core.n_cigar,global_context->format); - } - } - - if (intron_blocks1.size() + intron_blocks2.size() == 0){ - thread_context->res->noSplice += 1; - } - else { - thread_context->res->splice += 1; - } - //paired read - if (chrom1_id != -1 && chrom2_id != -1){ - thread_context->res->paired_reads += 1; - - if (strand1 == "-" && strand2 == "-" ){ - thread_context->res->mapped_minus_minus += 1; - } - if (strand1 == "+" && strand2 == "+" ){ - thread_context->res->mapped_plus_plus += 1; - } - if (strand1 == "+" && strand2 == "-" ){ - thread_context->res->mapped_plus_minus += 1; - } - if (strand1 == "-" && strand2 == "+"){ - thread_context->res->mapped_minus_plus += 1; - } - if (chrom1_id != chrom2_id ){ - thread_context->res->paired_diff_chrom += 1; - if(cur_read1 != NULL){ - bam_destroy1(cur_read1); - } - if(cur_read2 != NULL){ - bam_destroy1(cur_read2); - } - return; - } - } - - if (cur_read1 == NULL || IS_UNMAPPED(cur_read1)) { - strand1 = (strand2 == "-") ? "+" : "-"; - } - std::string strand = strand1; - if (global_context->stranded == "no"){ - //strand1 = "."; - //strand2 = "."; - strand = "."; - } - //mapped read len - //rRNA read - //std::cout << "before is rRNA" << std::endl; - if (global_context->rRNAIdx != NULL){ - if (global_context->rRNAIdx->is_rRNA(chrom1,exons1,strand) || global_context->rRNAIdx->is_rRNA(chrom2,exons2,strand)){ - thread_context->res->rRNA_read += 1; - if(cur_read1 != NULL){ - bam_destroy1(cur_read1); - } - if(cur_read2 != NULL){ - bam_destroy1(cur_read2); - } - return; - } - } -// std::cout << "after is rRNA" << std::endl; - //cur_time = time.time() - std::vector * mapped_exons = new std::vector(); - - std::pair ovp = ovp_gene(global_context->geneIdx,chrom1,exons1,chrom2,exons2,strand,mapped_exons); - if(ovp.second != -1){ - if(global_context->geneIdx->get_name(ovp.second) == "NM_001144761") - { - std::cout << qname << std::endl; - } - } -// std::cout << "after ovp gene" << std::endl; - thread_context->cov_prof->count(ovp.second,exons1,exons2,*mapped_exons); -// std::cout << "after cov prof" << std::endl; - switch(ovp.first) - { - case CDS: - thread_context->res->cds_exon_read +=1; - break; - case UTR5: - thread_context->res->utr_5_read +=1; - break; - case UTR3: - thread_context->res->utr_3_read +=1; - break; - case INTRON: - thread_context->res->intron_read +=1; - break; - case ITGUP1K: - thread_context->res->intergenic_up1kb_read +=1; - break; - case ITGDN1K: - thread_context->res->intergenic_down1kb_read +=1; - break; - default: - thread_context->res->intergenic_read +=1; - break; - } - if(chrom1 != "" && chrom2 != ""){ - if( IS_PROPERLYPAIRED(cur_read1)){ - if (strand1 == "+" || strand1 == "."){ - if (global_context->stranded == "reverse") { - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read2,chrom2,intron_blocks2,strand1); - } - else{ - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read1,chrom1,intron_blocks1,strand1); - } - } - else { - if (global_context->stranded == "reverse" ){ - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read1,chrom1,intron_blocks1,strand1); - } - else { - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read2,chrom2,intron_blocks2,strand1); - } - } - } - } -// std::cout << "before delete mapped exon" << std::endl; - delete mapped_exons; - if(cur_read1 != NULL){ - bam_destroy1(cur_read1); - } - if(cur_read2 != NULL){ - bam_destroy1(cur_read2); - } -} - -void* worker(void * vargs) -{ - struct arg_struct * args = (struct arg_struct *) vargs; - thread_context_t * thread_context = args->arg2; - global_context_t * global_context = args->arg1; - delete args; - - while (1){ - std::pair cur_read_pair; - while(1){ - int is_retrieved = 0; - //std::cout << "start worker" << std::endl; - //std::cout << "num of threads " << global_context->thread_contexts.size() << std::endl; - //std::cout << "thread id " << thread_context->thread_id << std::endl; - pthread_spin_lock(&thread_context->cur_reads_lock); - - if(thread_context->cur_reads->size()>0){ - cur_read_pair = thread_context->cur_reads->back(); - thread_context->cur_reads->pop_back(); - - is_retrieved = 1; - // std::cout << "in worker retrieved" << std::endl; - } - pthread_spin_unlock(&thread_context->cur_reads_lock); - if(global_context->all_finished && !is_retrieved) return NULL; - - if(is_retrieved) break; - else usleep(tick_time); - } - process_aligned_fragment(global_context,thread_context,cur_read_pair); - - } -} - - -//start threads -void init_thread(global_context_t * global_context,unsigned short threadNumber,int mapq) -{ - global_context->thread_number = threadNumber; - //global_context -> thread_contexts = (thread_context_t * ) malloc(sizeof(thread_context_t) * global_context -> thread_number); - if(threadNumber >1){ - for(int i=0;icur_reads_lock, PTHREAD_PROCESS_PRIVATE); - th_contx->thread_id = i; - //std::cout << "spin lock init " << i << std::endl; - th_contx->cur_reads = new std::vector >() ; - - th_contx->res = new Results(); - th_contx->clip_prof = new Clipping_prof(mapq); - th_contx->cov_prof = new Coverage_prof(global_context->geneIdx); - th_contx->inDist_prof = new InnerDist_prof(); - global_context->thread_contexts.push_back(th_contx); - } - - for(int i=0;iarg1 = global_context; - args->arg2 = global_context->thread_contexts[i]; - //int ret = pthread_create(&global_context -> thread_contexts[i].thread_object, NULL, worker, (void *)&args); - pthread_create(&global_context -> thread_contexts[i]->thread_object, NULL, worker, (void *)args); - } - } - else { - thread_context_t *th_contx = new thread_context_t(); - //std::cout << "spin lock init " << i << std::endl; - th_contx->cur_reads = new std::vector >() ; - - th_contx->res = new Results(); - th_contx->clip_prof = new Clipping_prof(mapq); - th_contx->cov_prof = new Coverage_prof(global_context->geneIdx); - th_contx->inDist_prof = new InnerDist_prof(); - global_context->thread_contexts.push_back(th_contx); - - } - - return; -} - -//release memory -void destroy_thread_context(global_context_t * global_context) -{ - - for(int i=0; i < global_context-> thread_number; i++) - { - delete global_context->thread_contexts[i]->res; - delete global_context->thread_contexts[i]->cov_prof; - delete global_context->thread_contexts[i]->clip_prof; - delete global_context->thread_contexts[i]->cur_reads; - delete global_context->thread_contexts[i]->inDist_prof; - if(global_context->thread_number >1){ - pthread_spin_destroy(&global_context -> thread_contexts[i]->cur_reads_lock); - } - delete global_context -> thread_contexts[i]; - } - -} - -//join threads -void join_threads(global_context_t * global_context) -{ - int xk1; - for(xk1=0; xk1 thread_number; xk1++) - pthread_join(global_context -> thread_contexts[xk1]->thread_object, NULL); -} - -//merge results -void merge_results(global_context_t * global_context,Results * res,Clipping_prof * clip_prof,Coverage_prof * cov_prof, InnerDist_prof * inDist_prof) -{ - for(int i=0;ithread_number;i++) - { - res->add(global_context->thread_contexts[i]->res); - clip_prof->add(global_context->thread_contexts[i]->clip_prof); - cov_prof->add(global_context->thread_contexts[i]->cov_prof); - inDist_prof->add(global_context->thread_contexts[i]->inDist_prof); - } -} - -// parsing BAM file -Results QC(std::string smp_name,GeneFeatures * geneIdx, rRNA * rRNAIdx,char * inputFile,std::string outdir,std::string outdir_fig,std::string stranded,int threadNumber,int mapq) -{ - //'''This class provides fuctions to parsing SAM or BAM files and quality controls.''' - //'''constructor. input could be bam or sam''' - - //int max_mapQ = 0; - int q_cut = mapq; - //qc result - global_context_t global_context; - global_context.stranded = stranded; - global_context.all_finished = 0; - - //multi-thread - //pthread_t thread_object; - //threads = [] - //open SAM/BAM file - samFile *fp_in = NULL; - bam1_t * aligned_read = NULL; - //bam_header_t *bh = NULL; - bam_hdr_t *header = NULL; - - std::string format = "BAM"; - - fp_in = sam_open(inputFile, "rb"); - - if(NULL == fp_in) { - std::cout << "Could not open file " << inputFile << std::endl; - std::exit(1); - } - if(hts_get_format(fp_in)->format == sam) {format = "SAM";} - global_context.format = format; - - aligned_read = bam_init1(); - - Results main_res; - //read header - header = sam_hdr_read(fp_in); - if(NULL == header){ - std::cout << "No header information." << std::endl; - std::exit(1); - } - for(int i=0;i< header->n_targets;i++) - { - char * p = header->target_name[i]; - std::string chr(p); - // std::cout << chr << std::endl; - global_context.refnames.push_back(chr); - } - //return main_res; - - std::string prev_read_id = ""; - std::vector multi_read1; - std::vector multi_read2; - - std::vector alignments_per_read; - - - Clipping_prof main_clip_prof(outdir+smp_name,outdir_fig+smp_name,mapq); - Coverage_prof main_cov_prof(outdir+smp_name,outdir_fig+smp_name,geneIdx); - //ReadDup_prof main_rDup_prof(outdir,outdir_fig); - InnerDist_prof main_inDist_prof(outdir+smp_name,outdir_fig+smp_name); - - // int lineno = 0; - global_context.geneIdx = geneIdx; - global_context.rRNAIdx = rRNAIdx; - init_thread(&global_context, threadNumber,mapq); - //std::cout << "global context thread number" << global_context.thread_contexts.size()<< std::endl; - //std::cout << "start read file" << std::endl; - - int current_thread_id = 0; - while(sam_read1(fp_in, header,aligned_read) > 0) - { - std::string qname =""; - char * name = bam_get_qname(aligned_read); - if(name !=NULL) - { - qname = std::string(name); - } - //SE reads - if (! IS_PAIRED(aligned_read)) - { - //count unmapped read - if (IS_UNMAPPED(aligned_read)) - { - main_res.unmapped_reads += 1; - main_res.total_reads += 1; - bam_destroy1(aligned_read); - aligned_read = bam_init1(); - continue; - } - - if (qname == prev_read_id || prev_read_id == "") { - alignments_per_read.push_back(aligned_read); - prev_read_id = qname; - aligned_read = bam_init1(); - continue; - } - else { - bam1_t *cur_read = NULL; - if (alignments_per_read.size() == 1){ //unique read - main_res.uniq_mapped_reads += 1; - main_res.total_reads += 1; - - cur_read = alignments_per_read[0]; - - int skip_read = 0; - - if (cur_read->core.flag & BAM_FQCFAIL || cur_read->core.qual < q_cut){ //skip QC fail read - main_clip_prof.set_qual(cur_read->core.qual); - main_res.low_qual += 1; - skip_read = 1; - } - if (IS_DUP(cur_read)){ //skip duplicate read - skip_read = 1; - } - if (skip_read == 1) { - bam_destroy1(cur_read); - cur_read = NULL; - } - } - else { //multi reads - main_res.multi_mapped_reads += 1; - main_res.total_reads += 1; - cur_read = alignments_per_read[0]; - main_clip_prof.set_qual(cur_read->core.qual); - for(unsigned int i=0;i1){ - //thread_context_t * thread_context = global_context.thread_contexts + current_thread_id; - thread_context_t * thread_context = global_context.thread_contexts[current_thread_id]; - pthread_spin_lock(&thread_context->cur_reads_lock); - thread_context->cur_reads->push_back(std::pair(cur_read,NULL)); - pthread_spin_unlock(&thread_context->cur_reads_lock); - current_thread_id++; - if(current_thread_id >= global_context.thread_number) current_thread_id = 0; - } - else { - process_aligned_fragment(&global_context,global_context.thread_contexts[0],std::pair (cur_read,NULL)); - //cur_reads.push_back(std::pair (cur_read,NULL)); - } - } - alignments_per_read.clear(); - alignments_per_read.push_back(aligned_read); - prev_read_id = qname; - aligned_read = bam_init1(); - } - } - else {//pair end read - main_res.is_pairEnd = true; - size_t flag_pos = qname.find('/'); - //cur_read_id = aligned_read.qname; - if (flag_pos != std::string::npos) {qname = qname.substr(0,flag_pos);} - - if (qname == prev_read_id ){ - if (IS_READ1(aligned_read) ){ multi_read1.push_back(aligned_read);} - if (IS_READ2(aligned_read) ){multi_read2.push_back(aligned_read);} - aligned_read = bam_init1(); - continue; - } - else { - bam1_t * cur_read1 = NULL; - bam1_t * cur_read2 = NULL; - main_res.total_reads += 1; - - //multi-reads - if (multi_read1.size() >1 || multi_read2.size() > 1){ - main_res.multi_mapped_reads += 1; - if (multi_read1.size() > 1 ){ - main_clip_prof.set_qual(multi_read1[0]->core.qual); - } - if (multi_read2.size() > 1 ){ - main_clip_prof.set_qual(multi_read2[0]->core.qual); - } - for(unsigned int i=0;i< multi_read1.size();i++) - { - bam_destroy1(multi_read1[i]); - } - for(unsigned int i=0;i< multi_read2.size();i++) - { - bam_destroy1(multi_read2[i]); - } - } - else { - //uniq read - if (multi_read1.size() == 1) {cur_read1 = multi_read1[0];} - if (multi_read2.size() == 1){ cur_read2 = multi_read2[0];} - - if (cur_read1 && !IS_UNMAPPED(cur_read1)){ - - if ( cur_read1->core.flag & BAM_FQCFAIL || cur_read1->core.qual < q_cut ){ - main_clip_prof.set_qual(cur_read1->core.qual); - bam_destroy1(cur_read1); - cur_read1 = NULL; - main_res.low_qual_read1 += 1; - } - else { - main_res.mapped_read1 += 1; - if (IS_REVERSE(cur_read1)){ main_res.reverse_read += 1;} - else{ main_res.forward_read +=1;} - } - } - else { - if(cur_read1) { bam_destroy1(cur_read1); cur_read1 = NULL;} - if (prev_read_id != ""){ main_res.unmapped_read1 += 1; } - } - - if (cur_read2 && !IS_UNMAPPED(cur_read2)){ - - if (cur_read2->core.flag & BAM_FQCFAIL || cur_read2->core.qual < q_cut){ - main_clip_prof.set_qual(cur_read2->core.qual); - main_res.low_qual_read2 += 1; - bam_destroy1(cur_read2); - cur_read2 = NULL; - } - else { - main_res.mapped_read2 += 1; - if (!cur_read1){ - if (!IS_REVERSE(cur_read2)){ main_res.reverse_read += 1;} - else{ main_res.forward_read +=1;} - } - } - } - else { - if(cur_read2) {bam_destroy1(cur_read2); cur_read2 = NULL;} - if (prev_read_id != ""){ // #not the first fragment - main_res.unmapped_read2 += 1;} - } - - if (cur_read1 || cur_read2 ){ - if(global_context.thread_number > 1){ - //thread_context_t * thread_context = global_context.thread_contexts + current_thread_id; - thread_context_t * thread_context = global_context.thread_contexts[current_thread_id]; - - //std::cout << "before spin lock" << std::endl; - pthread_spin_lock(&thread_context->cur_reads_lock); - //std::cout << "after spin lock" << std::endl; - thread_context->cur_reads->push_back(std::pair(cur_read1,cur_read2)); - - pthread_spin_unlock(&thread_context->cur_reads_lock); - current_thread_id++; - if(current_thread_id >= global_context.thread_number) - current_thread_id = 0; - } - else { - - process_aligned_fragment(&global_context,global_context.thread_contexts[0],std::pair (cur_read1,cur_read2)); - } - } - } - multi_read1.clear(); - multi_read2.clear(); - prev_read_id = qname; - if (IS_READ1(aligned_read)){ - multi_read1.push_back(aligned_read);} - if (IS_READ2(aligned_read)){ - multi_read2.push_back(aligned_read);} - } - - aligned_read = bam_init1(); - } - } - bam_destroy1(aligned_read); - bam_hdr_destroy(header); - sam_close(fp_in); - global_context.all_finished = 1; - //std::cout << "all finished" << std::endl; - // struct arg_struct * args = new arg_struct(); - // args->arg1 = &global_context; - // args->arg2 = global_context.thread_contexts[0]; - //worker((void *)args); - //std::cout << "after all finished" << std::endl; - //join threads - if(global_context.thread_number >1) - { - join_threads(&global_context); - } - merge_results(&global_context,&main_res,&main_clip_prof,&main_cov_prof,&main_inDist_prof); - destroy_thread_context(&global_context); - - - main_res.splice = int(main_res.splice); - main_res.noSplice = int(main_res.noSplice); - - if (main_res.is_pairEnd ){ - main_clip_prof.write(main_res.total_reads*2-main_res.unmapped_read1-main_res.unmapped_read2); - } - else { - main_clip_prof.write(main_res.total_reads); - } - - //main_rDup_prof.write(); - main_inDist_prof.write(); - int zero_exons = main_cov_prof.write(main_res.total_reads); - - main_res.read_dist_plot_file1 = outdir_fig +smp_name+ ".read_distr.png"; - main_res.read_dist_plot_file2 = outdir_fig + smp_name + ".read_distr_pie.png"; - - try { - std::ofstream ROUT; - - ROUT.open (outdir +smp_name+ ".read_distr.r", std::ofstream::out); - //ROUT = fopen(outfile + '.read_distr.r', 'w') - ROUT << "png(\"" << main_res.read_dist_plot_file1 << "\",width=500,height=500,units=\"px\")\n"; - ROUT << "M=c(" << std::to_string(main_res.cds_exon_read) << "," << std::to_string(main_res.utr_5_read) << "," << std::to_string(main_res.utr_3_read) << "," << std::to_string(main_res.intron_read) << "," << std::to_string(main_res.intergenic_up1kb_read) << "," << std::to_string(main_res.intergenic_down1kb_read) << "," << std::to_string(main_res.rRNA_read) << "," << std::to_string(main_res.intergenic_read) << ")\n"; - - ROUT << "Mname=c(\"CDS\",\"5UTR\",\"3UTR\",\"Intron\",\"TSS_Up_1Kb\",\"TES_Down_1Kb\",\"rRNA\",\"Others\")\n"; - ROUT << "val = barplot(M,xlab=\"\",space=1,ylab=\"Read Counts\",col=\"blue\",border=\"NA\")\n"; - ROUT << "text(x=seq(val[1],val[8],by=2),y=rep(0,8),srt=60,adj=0,offset=2,pos=1,xpd=T,labels=Mname)\n"; - ROUT << "dev.state = dev.off()\n"; - ROUT.close(); - - ROUT.open(outdir + smp_name+".read_distr_pie.r", std::ofstream::out); - if (geneIdx->total_exon != 0 ){ - ROUT << "png(\"" << main_res.read_dist_plot_file2 << "\",width=500,height=500,units=\"px\")\n"; - ROUT << "pie(c(" << std::to_string(geneIdx->total_exon-zero_exons) << ',' << std::to_string(zero_exons) << "),labels=c(\"Covered " << (geneIdx->total_exon - zero_exons) << " exons\",\"Uncovered\"),main=\"Exons\",radius=0.6,clockwise=T,col=c(\"blue\",\"white\"))\n"; - ROUT << "dev.state = dev.off()\n"; - } - ROUT.close(); - }catch(std::ofstream::failure e ){ - std::cout << "Error in writing plotting scripts.\n" << std::endl; - } - - main_res.insert_plot_file = main_inDist_prof.InnDist_fig_file; - main_res.insert_file = main_inDist_prof.InnDist_data_file; - - main_res.clipping_plot_file = main_clip_prof.clip_fig_file; - main_res.mapq_plot_file = main_clip_prof.mapq_fig_file; - main_res.mapq_file = main_clip_prof.mapq_data_file; - - //res.read_dup_plot_file = rDup_prof.plot_file - main_res.readLen_plot_file = main_clip_prof.readlen_fig_file; - - main_res.read_cov_plot_file = main_cov_prof.cov_fig_file; - main_res.geneCount_file = main_cov_prof.transcov_data_file; - main_res.trans_cov_plot_file = main_cov_prof.transcov_fig_file; - //res.seqDeDup_percent = rDup_prof.seqDeDup_percent - //res.posDeDup_percent = rDup_prof.posDeDup_percent - return main_res; -} - -int run_qc(char* out_dir, char* outfig_dir,char * ann_file, char* attrID, char* input_file,char* rRNA_file,char* label,int mapq,char* stranded,int thread_num) -{ - std::string gtf_fname (ann_file); - std::string id_attrID (attrID); - //std::string ifile (input_file); - std::string smp_name (label); - std::string strand_info (stranded); - std::string data_outdir (out_dir); - std::string fig_outdir (outfig_dir); - //std::cout << out_dir << std::endl; - //std::cout << smp_name << std::endl; - std::string smp_res_fname = data_outdir+smp_name+".res.txt"; - - //int thread_num = 1; - - std::string rRNA_fname (rRNA_file); - rRNA * rRNAIdx = NULL; - - GeneFeatures * geneIdx = new GeneFeatures(gtf_fname,id_attrID); - //std::cout << "finish gene index build " << std::endl; - if (rRNA_fname != "") { - rRNAIdx = new rRNA(rRNA_fname); - } - //std::cout << "finish index build " << std::endl; - Results res = QC(smp_name,geneIdx,rRNAIdx,input_file,data_outdir,fig_outdir,strand_info,thread_num,mapq); - - res.write(smp_res_fname); - - delete geneIdx; - delete rRNAIdx; - - return 1; -} - - - diff --git a/src/parseBAM.cpp.bak2 b/src/parseBAM.cpp.bak2 deleted file mode 100644 index 2df4284..0000000 --- a/src/parseBAM.cpp.bak2 +++ /dev/null @@ -1,857 +0,0 @@ -// -// parseBAM.cpp -// BAMQC_c++ -// -// Created by Ying Jin on 11/11/15. -// Copyright (c) 2015 Ying Jin. All rights reserved. -// - -#include "parseBAM.h" -#include "Coverage_prof.h" -#include "GeneFeatures.h" -#include "InnerDist_prof.h" -#include "Mappability.h" -#include "Results.h" -#include "rRNA.h" -#include "Constants.h" - -//#include -#include -#include -#include -//#include -#include -#include -#include - -#include "htslib/sam.h" - - - -unsigned int tick_time =3000; - -#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED) -#define IS_QCFAIL(bam) ((bam)->core.flag & BAM_FQCFAIL) - -#define IS_PAIRED_AND_MAPPED(bam) (((bam)->core.flag&BAM_FPAIRED) && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) - -#define IS_PROPERLYPAIRED(bam) (((bam)->core.flag&(BAM_FPAIRED|BAM_FPROPER_PAIR)) == (BAM_FPAIRED|BAM_FPROPER_PAIR) && !((bam)->core.flag&BAM_FUNMAP)) - -#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) -#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) -#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) -#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) -#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) -#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) -//#defind READ_NAME(bam) (bam_get_qname(bam)) - -/*DEF BAM_CMATCH = 0 -DEF BAM_CINS = 1 -DEF BAM_CDEL = 2 -DEF BAM_CREF_SKIP = 3 -DEF BAM_CSOFT_CLIP = 4 -DEF BAM_CHARD_CLIP = 5 -DEF BAM_CPAD = 6 -DEF BAM_CEQUAL = 7 -DEF BAM_CDIFF = 8*/ - -typedef struct -{ - unsigned short thread_id; - pthread_t thread_object; - //pthread_spinlock_t cur_reads_lock; - pthread_spinlock_t cur_reads_lock; - - std::vector > *cur_reads; - - Results * res; - InnerDist_prof * inDist_prof; - Clipping_prof * clip_prof; - Coverage_prof * cov_prof; - -} thread_context_t; - -typedef struct -{ - unsigned short thread_number; - int all_finished; - std::string format; - std::string stranded; - //bam_hdr_t * header; - std::vector refnames; - std::vector thread_contexts; - - GeneFeatures * geneIdx; - rRNA * rRNAIdx; - -} global_context_t; - -struct arg_struct { - global_context_t * arg1; - thread_context_t * arg2; -}; -std::vector > fetch_intron(int st, uint32_t * cigar, uint32_t n_cigar,std::string format) -{ - //''' fetch intron regions defined by cigar. st must be zero based return list of tuple of (st, end)''' - //match = re.compile(r'(\d+)(\D)') - int chrom_st = st; - if (format == "BAM") { chrom_st += 1 ;} - - std::vector > intron_bound ; - - for (unsigned int i=0; i < n_cigar; i++){ //code and size - if (bam_cigar_op(cigar[i])==BAM_CMATCH) { chrom_st += bam_cigar_oplen(cigar[i]);} //match - if (bam_cigar_op(cigar[i])==BAM_CINS) {continue;} //insertion - if (bam_cigar_op(cigar[i])==BAM_CDEL) { chrom_st += bam_cigar_oplen(cigar[i]);} //deletion - if (bam_cigar_op(cigar[i])==BAM_CREF_SKIP) { intron_bound.push_back(std::pair (chrom_st,chrom_st+ bam_cigar_oplen(cigar[i]))); } //gap - if (bam_cigar_op(cigar[i])==BAM_CSOFT_CLIP) { chrom_st += bam_cigar_oplen(cigar[i]);}// soft clipping - } - - return intron_bound ; -} - -std::vector > fetch_exon(int st,uint32_t * cigar,uint32_t n_cigar,std::string format) -{ - //''' fetch exon regions defined by cigar. st must be zero based return list of tuple of (st, end)''' - //match = re.compile(r'(\d+)(\D)') - int chrom_st = st; - if (format == "BAM") { chrom_st += 1;} - - std::vector >exon_bound; - - for (unsigned int i=0; i < n_cigar; i++){ //code and size - if (bam_cigar_op(cigar[i])==BAM_CMATCH) { exon_bound.push_back(std::pair (chrom_st,chrom_st + bam_cigar_oplen(cigar[i])));} //match - if (bam_cigar_op(cigar[i])==BAM_CINS) {continue;} //insertion - if (bam_cigar_op(cigar[i])==BAM_CDEL) { chrom_st += bam_cigar_oplen(cigar[i]);} //deletion - if (bam_cigar_op(cigar[i])==BAM_CREF_SKIP) { chrom_st += bam_cigar_oplen(cigar[i]); } //gap - if (bam_cigar_op(cigar[i])==BAM_CSOFT_CLIP) { chrom_st += bam_cigar_oplen(cigar[i]);}// soft clipping - } - return exon_bound; -} - -//return flag and gene id -std::pair ovp_gene(GeneFeatures * geneIdx,std::string chrom1,std::vector >exon_blocks1,std::string chrom2,std::vector > exon_blocks2,std::string strand,std::vector * mapped_exons) -{ - - std::vector res1 = geneIdx->Gene_annotation(chrom1,exon_blocks1,strand,mapped_exons); - std::vector res2 = geneIdx->Gene_annotation(chrom2,exon_blocks2,strand,mapped_exons); - - //intersect genes1 and genes2 - if (res1.size() > 1 && res2.size() > 1 ) - { - int type1 = res1[0]; - int type2 = res2[0]; - - if (res1.size() == 2 && res2.size() == 2){ - if (res1[1] == res2[1]){ - if(type1 < type2) {return std::pair (type1,res1[1]);} - else { return std::pair (type2,res1[1]); } - } - else { return std::pair (-1,res1[1]);} //ambiguous - - } - if (res1.size() == 2 && res2.size() > 2) { - - return std::pair (type1,res1[1]); - } - if (res2.size() == 2 && res1.size() >2) { - return std::pair (type2,res2[1]); - } - if (res2.size() > 2 && res1.size() > 2) { - if(type1 < type2) { return std::pair (type1,-1);} - else { return std::pair (type2,-1); } - } - - } - if (res1.size() ==2 && res2.size() ==1 ) - { return std::pair (res1[0],res1[1]);} - - if (res2.size() == 2 && res1.size() == 1) - { return std::pair (res2[0],res2[1]); } - - return std::pair (-1,-1); - -} - -void process_aligned_fragment(global_context_t * global_context,thread_context_t * thread_context,std::pair read_pair) -{ - - //std::cout << "start process" << std::endl; - bam1_t * cur_read1 = read_pair.first; - bam1_t * cur_read2 = read_pair.second; - std::string strand1 = "."; - std::string strand2 = "."; - int chrom1_id = -1; - int chrom2_id = -1; - std::vector > exons1 ; - std::vector > exons2 ; - std::vector > intron_blocks1 ; - std::vector > intron_blocks2 ; - std::string qname = ""; - std::string chrom1 = ""; - std::string chrom2 = ""; - - if(cur_read1 != NULL ) { - if(!IS_UNMAPPED(cur_read1)) - { - uint32_t *cigar = bam_get_cigar(cur_read1); - // char * name = bam_get_qname(cur_read1); - // printf("read name %s \n", name); - thread_context-> clip_prof->set(cur_read1->core.l_qseq,cur_read1->core.n_cigar,cigar,cur_read1->core.qual); - // printf("after clip prof set %s \n", name); - chrom1_id = cur_read1->core.tid; - //p1 = global_context->header->target_name[cur_read1->core.tid]; - - //qname = std::string(name); - //std::cout << "chromID " << chrom1_id << std::endl; - if((size_t)cur_read1->core.tid < global_context->refnames.size()){ - chrom1 = global_context->refnames[cur_read1->core.tid]; - } - else { - std::cout <<"missing reference sequences." << std::endl; - std::exit(1); - } - - //std::cout << chrom1 << std::endl; - strand1 = IS_REVERSE(cur_read1) ? "-" : "+"; - - if (global_context->stranded =="reverse") { - strand1 = (strand1 == "+") ? "-" : "+"; - } - exons1 = fetch_exon(cur_read1->core.pos, cigar,cur_read1->core.n_cigar, global_context->format); - intron_blocks1 = fetch_intron(cur_read1->core.pos,cigar,cur_read1->core.n_cigar,global_context->format); - } - } - if(cur_read2 != NULL){ - if(! IS_UNMAPPED(cur_read2)){ - uint32_t *cigar = bam_get_cigar(cur_read2); - // char * name = bam_get_qname(cur_read2); - // printf("read name %s \n", name); - thread_context->clip_prof->set(cur_read2->core.l_qseq,cur_read2->core.n_cigar,cigar,cur_read2->core.qual); - strand2 = IS_REVERSE(cur_read2) ? "-" : "+"; - - if (global_context-> stranded =="reverse") { - strand2 = ( strand2 == "+") ? "-" : "+"; - } - chrom2_id = cur_read2->core.tid; - if((size_t)cur_read2->core.tid < global_context->refnames.size()){ - chrom2 = global_context->refnames[cur_read2->core.tid]; - } - else { - std::cout <<"missing reference sequences." << std::endl; - std::exit(1); - } - exons2 = fetch_exon(cur_read2->core.pos, cigar,cur_read2->core.n_cigar, global_context->format); - intron_blocks2 = fetch_intron(cur_read2->core.pos,cigar,cur_read2->core.n_cigar,global_context->format); - } - } - - if (intron_blocks1.size() + intron_blocks2.size() == 0){ - thread_context->res->noSplice += 1; - } - else { - thread_context->res->splice += 1; - } - //paired read - if (chrom1_id != -1 && chrom2_id != -1){ - thread_context->res->paired_reads += 1; - - if (chrom1_id != chrom2_id ){ - thread_context->res->paired_diff_chrom += 1; - } - if (strand1 == "-" && strand2 == "-" ){ - thread_context->res->mapped_minus_minus += 1; - } - if (strand1 == "+" && strand2 == "+" ){ - thread_context->res->mapped_plus_plus += 1; - } - if (strand1 == "+" && strand2 == "-" ){ - thread_context->res->mapped_plus_minus += 1; - } - if (strand1 == "-" && strand2 == "+"){ - thread_context->res->mapped_minus_plus += 1; - } - } - if (global_context->stranded == "no"){ - strand1 = "."; - strand2 = "."; - } - - if (cur_read1 == NULL || IS_UNMAPPED(cur_read1)) { - strand1 = (strand2 == "-") ? "+" : "-"; - } - //mapped read len - //rRNA read - //std::cout << "before is rRNA" << std::endl; - if (global_context->rRNAIdx != NULL){ - if (global_context->rRNAIdx->is_rRNA(chrom1,exons1,strand1) || global_context->rRNAIdx->is_rRNA(chrom2,exons2,strand1)){ - thread_context->res->rRNA_read += 1; - return; - } - } -// std::cout << "after is rRNA" << std::endl; - //cur_time = time.time() - std::vector * mapped_exons = new std::vector(); - - std::pair ovp = ovp_gene(global_context->geneIdx,chrom1,exons1,chrom2,exons2,strand1,mapped_exons); - -// std::cout << "after ovp gene" << std::endl; - thread_context->cov_prof->count(ovp.second,exons1,exons2,*mapped_exons); -// std::cout << "after cov prof" << std::endl; - switch(ovp.first) - { - case CDS: - thread_context->res->cds_exon_read +=1; - break; - case UTR5: - thread_context->res->utr_5_read +=1; - break; - case UTR3: - thread_context->res->utr_3_read +=1; - break; - case INTRON: - thread_context->res->intron_read +=1; - break; - case ITGUP1K: - thread_context->res->intergenic_up1kb_read +=1; - break; - case ITGDN1K: - thread_context->res->intergenic_down1kb_read +=1; - break; - default: - thread_context->res->intergenic_read +=1; - break; - } - if(chrom1 != "" && chrom2 != ""){ - if( IS_PROPERLYPAIRED(cur_read1)){ - if (strand1 == "+" || strand1 == "."){ - if (global_context->stranded == "reverse") { - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read2,chrom2,intron_blocks2,strand1); - } - else{ - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read1,chrom1,intron_blocks1,strand1); - } - } - else { - if (global_context->stranded == "reverse" ){ - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read1,chrom1,intron_blocks1,strand1); - } - else { - thread_context->inDist_prof->count(global_context->geneIdx,ovp.first,cur_read2,chrom2,intron_blocks2,strand1); - } - } - } - } -// std::cout << "before delete mapped exon" << std::endl; - delete mapped_exons; - if(cur_read1 != NULL){ - bam_destroy1(cur_read1); - } - if(cur_read2 != NULL){ - bam_destroy1(cur_read2); - } -} - -void* worker(void * vargs) -{ - struct arg_struct * args = (struct arg_struct *) vargs; - thread_context_t * thread_context = args->arg2; - global_context_t * global_context = args->arg1; - delete args; - - while (1){ - std::pair cur_read_pair; - while(1){ - int is_retrieved = 0; - //std::cout << "start worker" << std::endl; - //std::cout << "num of threads " << global_context->thread_contexts.size() << std::endl; - //std::cout << "thread id " << thread_context->thread_id << std::endl; - pthread_spin_lock(&thread_context->cur_reads_lock); - - if(thread_context->cur_reads->size()>0){ - cur_read_pair = thread_context->cur_reads->back(); - thread_context->cur_reads->pop_back(); - - is_retrieved = 1; - // std::cout << "in worker retrieved" << std::endl; - } - pthread_spin_unlock(&thread_context->cur_reads_lock); - if(global_context->all_finished && !is_retrieved) return NULL; - - if(is_retrieved) break; - else usleep(tick_time); - } - process_aligned_fragment(global_context,thread_context,cur_read_pair); - - } -} - - -//start threads -void init_thread(global_context_t * global_context,unsigned short threadNumber,int mapq) -{ - global_context->thread_number = threadNumber; - //global_context -> thread_contexts = (thread_context_t * ) malloc(sizeof(thread_context_t) * global_context -> thread_number); - if(threadNumber >1){ - for(int i=0;icur_reads_lock, PTHREAD_PROCESS_PRIVATE); - th_contx->thread_id = i; - //std::cout << "spin lock init " << i << std::endl; - th_contx->cur_reads = new std::vector >() ; - - th_contx->res = new Results(); - th_contx->clip_prof = new Clipping_prof(mapq); - th_contx->cov_prof = new Coverage_prof(global_context->geneIdx); - th_contx->inDist_prof = new InnerDist_prof(); - global_context->thread_contexts.push_back(th_contx); - } - - for(int i=0;iarg1 = global_context; - args->arg2 = global_context->thread_contexts[i]; - //int ret = pthread_create(&global_context -> thread_contexts[i].thread_object, NULL, worker, (void *)&args); - pthread_create(&global_context -> thread_contexts[i]->thread_object, NULL, worker, (void *)args); - } - } - else { - thread_context_t *th_contx = new thread_context_t(); - //std::cout << "spin lock init " << i << std::endl; - th_contx->cur_reads = new std::vector >() ; - - th_contx->res = new Results(); - th_contx->clip_prof = new Clipping_prof(mapq); - th_contx->cov_prof = new Coverage_prof(global_context->geneIdx); - th_contx->inDist_prof = new InnerDist_prof(); - global_context->thread_contexts.push_back(th_contx); - - } - - return; -} - -//release memory -void destroy_thread_context(global_context_t * global_context) -{ - - for(int i=0; i < global_context-> thread_number; i++) - { - delete global_context->thread_contexts[i]->res; - delete global_context->thread_contexts[i]->cov_prof; - delete global_context->thread_contexts[i]->clip_prof; - delete global_context->thread_contexts[i]->cur_reads; - delete global_context->thread_contexts[i]->inDist_prof; - if(global_context->thread_number >1){ - pthread_spin_destroy(&global_context -> thread_contexts[i]->cur_reads_lock); - } - delete global_context -> thread_contexts[i]; - } - -} - -//join threads -void join_threads(global_context_t * global_context) -{ - int xk1; - for(xk1=0; xk1 thread_number; xk1++) - pthread_join(global_context -> thread_contexts[xk1]->thread_object, NULL); -} - -//merge results -void merge_results(global_context_t * global_context,Results * res,Clipping_prof * clip_prof,Coverage_prof * cov_prof, InnerDist_prof * inDist_prof) -{ - for(int i=0;ithread_number;i++) - { - res->add(global_context->thread_contexts[i]->res); - clip_prof->add(global_context->thread_contexts[i]->clip_prof); - cov_prof->add(global_context->thread_contexts[i]->cov_prof); - inDist_prof->add(global_context->thread_contexts[i]->inDist_prof); - } -} - -// parsing BAM file -Results QC(std::string smp_name,GeneFeatures * geneIdx, rRNA * rRNAIdx,char * inputFile,std::string outdir,std::string outdir_fig,std::string stranded,int threadNumber,int mapq) -{ - //'''This class provides fuctions to parsing SAM or BAM files and quality controls.''' - //'''constructor. input could be bam or sam''' - - //int max_mapQ = 0; - int q_cut = mapq; - //qc result - global_context_t global_context; - global_context.stranded = stranded; - global_context.all_finished = 0; - - //multi-thread - //pthread_t thread_object; - //threads = [] - //open SAM/BAM file - samFile *fp_in = NULL; - bam1_t * aligned_read = NULL; - //bam_header_t *bh = NULL; - bam_hdr_t *header = NULL; - - std::string format = "BAM"; - - fp_in = sam_open(inputFile, "rb"); - - if(NULL == fp_in) { - std::cout << "Could not open file " << inputFile << std::endl; - std::exit(1); - } - if(hts_get_format(fp_in)->format == sam) {format = "SAM";} - global_context.format = format; - - aligned_read = bam_init1(); - - Results main_res; - //read header - header = sam_hdr_read(fp_in); - if(NULL == header){ - std::cout << "No header information." << std::endl; - std::exit(1); - } - for(int i=0;i< header->n_targets;i++) - { - char * p = header->target_name[i]; - std::string chr(p); - // std::cout << chr << std::endl; - global_context.refnames.push_back(chr); - } - //return main_res; - - std::string prev_read_id = ""; - std::vector multi_read1; - std::vector multi_read2; - - std::vector alignments_per_read; - - - Clipping_prof main_clip_prof(outdir+smp_name,outdir_fig+smp_name,mapq); - Coverage_prof main_cov_prof(outdir+smp_name,outdir_fig+smp_name,geneIdx); - //ReadDup_prof main_rDup_prof(outdir,outdir_fig); - InnerDist_prof main_inDist_prof(outdir+smp_name,outdir_fig+smp_name); - - // int lineno = 0; - global_context.geneIdx = geneIdx; - global_context.rRNAIdx = rRNAIdx; - init_thread(&global_context, threadNumber,mapq); - //std::cout << "global context thread number" << global_context.thread_contexts.size()<< std::endl; - //std::cout << "start read file" << std::endl; - - int current_thread_id = 0; - while(sam_read1(fp_in, header,aligned_read) > 0) - { - std::string qname =""; - char * name = bam_get_qname(aligned_read); - if(name !=NULL) - { - qname = std::string(name); - } - //SE reads - if (! IS_PAIRED(aligned_read)) - { - //count unmapped read - if (IS_UNMAPPED(aligned_read)) - { - main_res.unmapped_reads += 1; - main_res.total_reads += 1; - bam_destroy1(aligned_read); - aligned_read = bam_init1(); - continue; - } - - if (qname == prev_read_id || prev_read_id == "") { - alignments_per_read.push_back(aligned_read); - prev_read_id = qname; - aligned_read = bam_init1(); - continue; - } - else { - bam1_t *cur_read = NULL; - if (alignments_per_read.size() == 1){ //unique read - main_res.uniq_mapped_reads += 1; - main_res.total_reads += 1; - - cur_read = alignments_per_read[0]; - - int skip_read = 0; - - if (cur_read->core.flag & BAM_FQCFAIL || cur_read->core.qual < q_cut){ //skip QC fail read - main_clip_prof.set_qual(cur_read->core.qual); - main_res.low_qual += 1; - skip_read = 1; - } - if (IS_DUP(cur_read)){ //skip duplicate read - skip_read = 1; - } - if (skip_read == 1) { - bam_destroy1(cur_read); - cur_read = NULL; - } - } - else { //multi reads - main_res.multi_mapped_reads += 1; - main_res.total_reads += 1; - cur_read = alignments_per_read[0]; - main_clip_prof.set_qual(cur_read->core.qual); - for(unsigned int i=0;i1){ - //thread_context_t * thread_context = global_context.thread_contexts + current_thread_id; - thread_context_t * thread_context = global_context.thread_contexts[current_thread_id]; - pthread_spin_lock(&thread_context->cur_reads_lock); - thread_context->cur_reads->push_back(std::pair(cur_read,NULL)); - pthread_spin_unlock(&thread_context->cur_reads_lock); - current_thread_id++; - if(current_thread_id >= global_context.thread_number) current_thread_id = 0; - } - else { - process_aligned_fragment(&global_context,global_context.thread_contexts[0],std::pair (cur_read,NULL)); - //cur_reads.push_back(std::pair (cur_read,NULL)); - } - } - alignments_per_read.clear(); - alignments_per_read.push_back(aligned_read); - prev_read_id = qname; - aligned_read = bam_init1(); - } - } - else {//pair end read - main_res.is_pairEnd = true; - size_t flag_pos = qname.find('/'); - //cur_read_id = aligned_read.qname; - if (flag_pos != std::string::npos) {qname = qname.substr(0,flag_pos);} - - if (qname == prev_read_id ){ - if (IS_READ1(aligned_read) ){ multi_read1.push_back(aligned_read);} - if (IS_READ2(aligned_read) ){multi_read2.push_back(aligned_read);} - aligned_read = bam_init1(); - continue; - } - else { - bam1_t * cur_read1 = NULL; - bam1_t * cur_read2 = NULL; - main_res.total_reads += 1; - - //multi-reads - if (multi_read1.size() >1 || multi_read2.size() > 1){ - main_res.multi_mapped_reads += 1; - if (multi_read1.size() > 1 ){ - main_clip_prof.set_qual(multi_read1[0]->core.qual); - } - if (multi_read2.size() > 1 ){ - main_clip_prof.set_qual(multi_read2[0]->core.qual); - } - for(unsigned int i=0;i< multi_read1.size();i++) - { - bam_destroy1(multi_read1[i]); - } - for(unsigned int i=0;i< multi_read2.size();i++) - { - bam_destroy1(multi_read2[i]); - } - } - else { - //uniq read - if (multi_read1.size() == 1) {cur_read1 = multi_read1[0];} - if (multi_read2.size() == 1){ cur_read2 = multi_read2[0];} - - if (cur_read1 && !IS_UNMAPPED(cur_read1)){ - - if ( cur_read1->core.flag & BAM_FQCFAIL || cur_read1->core.qual < q_cut ){ - main_clip_prof.set_qual(cur_read1->core.qual); - bam_destroy1(cur_read1); - cur_read1 = NULL; - main_res.low_qual_read1 += 1; - } - else { - main_res.mapped_read1 += 1; - if (IS_REVERSE(cur_read1)){ main_res.reverse_read += 1;} - else{ main_res.forward_read +=1;} - } - } - else { - if(cur_read1) { bam_destroy1(cur_read1); cur_read1 = NULL;} - if (prev_read_id != ""){ main_res.unmapped_read1 += 1; } - } - - if (cur_read2 && !IS_UNMAPPED(cur_read2)){ - - if (cur_read2->core.flag & BAM_FQCFAIL || cur_read2->core.qual < q_cut){ - main_clip_prof.set_qual(cur_read2->core.qual); - main_res.low_qual_read2 += 1; - bam_destroy1(cur_read2); - cur_read2 = NULL; - } - else { - main_res.mapped_read2 += 1; - if (!cur_read1){ - if (!IS_REVERSE(cur_read2)){ main_res.reverse_read += 1;} - else{ main_res.forward_read +=1;} - } - } - } - else { - if(cur_read2) {bam_destroy1(cur_read2); cur_read2 = NULL;} - if (prev_read_id != ""){ // #not the first fragment - main_res.unmapped_read2 += 1;} - } - - if (cur_read1 || cur_read2 ){ - if(global_context.thread_number > 1){ - //thread_context_t * thread_context = global_context.thread_contexts + current_thread_id; - thread_context_t * thread_context = global_context.thread_contexts[current_thread_id]; - - //std::cout << "before spin lock" << std::endl; - pthread_spin_lock(&thread_context->cur_reads_lock); - //std::cout << "after spin lock" << std::endl; - thread_context->cur_reads->push_back(std::pair(cur_read1,cur_read2)); - - pthread_spin_unlock(&thread_context->cur_reads_lock); - current_thread_id++; - if(current_thread_id >= global_context.thread_number) - current_thread_id = 0; - } - else { - - process_aligned_fragment(&global_context,global_context.thread_contexts[0],std::pair (cur_read1,cur_read2)); - } - } - } - multi_read1.clear(); - multi_read2.clear(); - prev_read_id = qname; - if (IS_READ1(aligned_read)){ - multi_read1.push_back(aligned_read);} - if (IS_READ2(aligned_read)){ - multi_read2.push_back(aligned_read);} - } - - aligned_read = bam_init1(); - } - } - bam_destroy1(aligned_read); - bam_hdr_destroy(header); - sam_close(fp_in); - global_context.all_finished = 1; - //std::cout << "all finished" << std::endl; - // struct arg_struct * args = new arg_struct(); - // args->arg1 = &global_context; - // args->arg2 = global_context.thread_contexts[0]; - //worker((void *)args); - //std::cout << "after all finished" << std::endl; - //join threads - if(global_context.thread_number >1) - { - join_threads(&global_context); - } - merge_results(&global_context,&main_res,&main_clip_prof,&main_cov_prof,&main_inDist_prof); - destroy_thread_context(&global_context); - - - main_res.splice = int(main_res.splice); - main_res.noSplice = int(main_res.noSplice); - - if (main_res.is_pairEnd ){ - main_clip_prof.write(main_res.total_reads*2-main_res.unmapped_read1-main_res.unmapped_read2); - } - else { - main_clip_prof.write(main_res.total_reads); - } - - //main_rDup_prof.write(); - main_inDist_prof.write(); - int zero_exons = main_cov_prof.write(main_res.total_reads); - - main_res.read_dist_plot_file1 = outdir_fig +smp_name+ ".read_distr.png"; - main_res.read_dist_plot_file2 = outdir_fig + smp_name + ".read_distr_pie.png"; - - try { - std::ofstream ROUT; - - ROUT.open (outdir +smp_name+ ".read_distr.r", std::ofstream::out); - //ROUT = fopen(outfile + '.read_distr.r', 'w') - ROUT << "png(\"" << main_res.read_dist_plot_file1 << "\",width=500,height=500,units=\"px\")\n"; - ROUT << "M=c(" << std::to_string(main_res.cds_exon_read) << "," << std::to_string(main_res.utr_5_read) << "," << std::to_string(main_res.utr_3_read) << "," << std::to_string(main_res.intron_read) << "," << std::to_string(main_res.intergenic_up1kb_read) << "," << std::to_string(main_res.intergenic_down1kb_read) << "," << std::to_string(main_res.rRNA_read) << "," << std::to_string(main_res.intergenic_read) << ")\n"; - - ROUT << "Mname=c(\"CDS\",\"5UTR\",\"3UTR\",\"Intron\",\"TSS_Up_1Kb\",\"TES_Down_1Kb\",\"rRNA\",\"Others\")\n"; - ROUT << "val = barplot(M,xlab=\"\",space=1,ylab=\"Read Counts\",col=\"blue\",border=\"NA\")\n"; - ROUT << "text(x=seq(val[1],val[8],by=2),y=rep(0,8),srt=60,adj=0,offset=2,pos=1,xpd=T,labels=Mname)\n"; - ROUT << "dev.state = dev.off()\n"; - ROUT.close(); - - ROUT.open(outdir + smp_name+".read_distr_pie.r", std::ofstream::out); - if (geneIdx->total_exon != 0 ){ - ROUT << "png(\"" << main_res.read_dist_plot_file2 << "\",width=500,height=500,units=\"px\")\n"; - ROUT << "pie(c(" << std::to_string(geneIdx->total_exon-zero_exons) << ',' << std::to_string(zero_exons) << "),labels=c(\"Covered " << (geneIdx->total_exon - zero_exons) << " exons\",\"Uncovered\"),main=\"Exons\",radius=0.6,clockwise=T,col=c(\"blue\",\"white\"))\n"; - ROUT << "dev.state = dev.off()\n"; - } - ROUT.close(); - }catch(std::ofstream::failure e ){ - std::cout << "Error in writing plotting scripts.\n" << std::endl; - } - - main_res.insert_plot_file = main_inDist_prof.InnDist_fig_file; - main_res.insert_file = main_inDist_prof.InnDist_data_file; - - main_res.clipping_plot_file = main_clip_prof.clip_fig_file; - main_res.mapq_plot_file = main_clip_prof.mapq_fig_file; - main_res.mapq_file = main_clip_prof.mapq_data_file; - - //res.read_dup_plot_file = rDup_prof.plot_file - main_res.readLen_plot_file = main_clip_prof.readlen_fig_file; - - main_res.read_cov_plot_file = main_cov_prof.cov_fig_file; - main_res.geneCount_file = main_cov_prof.transcov_data_file; - main_res.trans_cov_plot_file = main_cov_prof.transcov_fig_file; - //res.seqDeDup_percent = rDup_prof.seqDeDup_percent - //res.posDeDup_percent = rDup_prof.posDeDup_percent - return main_res; -} - -int run_qc(char* out_dir, char* outfig_dir,char * ann_file, char* attrID, char* input_file,char* rRNA_file,char* label,int mapq,char* stranded,int thread_num) -{ - std::string gtf_fname (ann_file); - std::string id_attrID (attrID); - //std::string ifile (input_file); - std::string smp_name (label); - std::string strand_info (stranded); - std::string data_outdir (out_dir); - std::string fig_outdir (outfig_dir); - //std::cout << out_dir << std::endl; - //std::cout << smp_name << std::endl; - std::string smp_res_fname = data_outdir+smp_name+".res.txt"; - - //int thread_num = 1; - - std::string rRNA_fname (rRNA_file); - rRNA * rRNAIdx = NULL; - - GeneFeatures * geneIdx = new GeneFeatures(gtf_fname,id_attrID); - //std::cout << "finish gene index build " << std::endl; - if (rRNA_fname != "") { - rRNAIdx = new rRNA(rRNA_fname); - } - //std::cout << "finish index build " << std::endl; - Results res = QC(smp_name,geneIdx,rRNAIdx,input_file,data_outdir,fig_outdir,strand_info,thread_num,mapq); - - res.write(smp_res_fname); - - delete geneIdx; - delete rRNAIdx; - - return 1; -} - - -