Skip to content

Commit

Permalink
feat: added per-read flag and map quality filtering
Browse files Browse the repository at this point in the history
 - closes #183, flag based filtering
 - closes #189, mapping quality based filtering
 - option '-F' filters reads containing any of these flags
 - option '-f' filters reads not containing all these flags
 - option '-q' filters reads below this mapping quality
  • Loading branch information
TimD1 committed Sep 19, 2024
1 parent 1260bf9 commit 409a492
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 6 deletions.
18 changes: 16 additions & 2 deletions src/cis-splice-effects/cis_splice_effects_identifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ void CisSpliceEffectsIdentifier::usage(ostream& out) {
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n"
<< "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n"
<< "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl;
Expand Down Expand Up @@ -113,7 +116,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
optind = 1; //Reset before parsing again.
stringstream help_ss;
char c;
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:b:C")) != -1) {
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISht:s:a:m:M:f:F:q:b:C")) != -1) {
switch(c) {
case 'o':
output_file_ = string(optarg);
Expand Down Expand Up @@ -170,6 +173,15 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'b':
output_barcodes_file_ = string(optarg);
break;
Expand Down Expand Up @@ -285,7 +297,9 @@ void CisSpliceEffectsIdentifier::identify() {
} else {
ref_to_pass = "NA";
}
JunctionsExtractor je1(bam_, variant_region, strandness_, strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_, ref_to_pass);
JunctionsExtractor je1(bam_, variant_region, strandness_,
strand_tag_, min_anchor_length_, min_intron_length_, max_intron_length_,
filter_flags_, require_flags_, min_map_qual_, ref_to_pass);
je1.identify_junctions_from_BAM();
vector<Junction> junctions = je1.get_all_junctions();
//Add all the junctions to the unique set
Expand Down
11 changes: 10 additions & 1 deletion src/cis-splice-effects/cis_splice_effects_identifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,19 @@ class CisSpliceEffectsIdentifier {
//tag used in BAM to denote strand, default "XS"
string strand_tag_;
//Minimum anchor length for junctions
//Junctions need atleast this many bp overlap
//Junctions need at least this many bp overlap
// on both ends.
uint32_t min_anchor_length_;
//Minimum length of an intron, i.e min junction width
uint32_t min_intron_length_;
//Maximum length of an intron, i.e max junction width
uint32_t max_intron_length_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
//whether to override strand of extracted junctions using intron-motif method
bool override_strand_with_canonical_intron_motif_;
public:
Expand All @@ -114,6 +120,9 @@ class CisSpliceEffectsIdentifier {
min_anchor_length_(8),
min_intron_length_(70),
max_intron_length_(500000),
filter_flags_(0),
require_flags_(0),
min_map_qual_(0),
override_strand_with_canonical_intron_motif_(false) {}
//Destructor
~CisSpliceEffectsIdentifier() {
Expand Down
32 changes: 30 additions & 2 deletions src/junctions/junctions_extractor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
optind = 1; //Reset before parsing again.
int c;
stringstream help_ss;
while((c = getopt(argc, argv, "ha:m:M:o:r:t:s:b:")) != -1) {
while((c = getopt(argc, argv, "ha:m:M:f:F:q:o:r:t:s:b:")) != -1) {
switch(c) {
case 'h':
usage(help_ss);
Expand All @@ -57,6 +57,15 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
case 'M':
max_intron_length_ = atoi(optarg);
break;
case 'f':
require_flags_ = atoi(optarg);
break;
case 'F':
filter_flags_ = atoi(optarg);
break;
case 'q':
min_map_qual_ = atoi(optarg);
break;
case 'o':
output_file_ = string(optarg);
break;
Expand Down Expand Up @@ -108,10 +117,17 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
throw runtime_error("Strandness mode 'intron-motif' requires a fasta file!\n\n");
}
}
if ( (require_flags_ & filter_flags_) != 0) {
usage();
throw runtime_error("No overlap allowed between '-f' and '-F' options (same flag filtered and required)!\n\n");
}

cerr << "Minimum junction anchor length: " << min_anchor_length_ << endl;
cerr << "Minimum intron length: " << min_intron_length_ << endl;
cerr << "Maximum intron length: " << max_intron_length_ << endl;
cerr << "Require alignment flags: " << require_flags_ << endl;
cerr << "Filter alignment flags: " << filter_flags_ << endl;
cerr << "Minimum alignment mapping quality: " << int(min_map_qual_) << endl;
cerr << "Alignment: " << bam_ << endl;
cerr << "Output file: " << output_file_ << endl;
if (output_barcodes_file_ != "NA"){
Expand All @@ -130,6 +146,9 @@ int JunctionsExtractor::usage(ostream& out) {
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-f INT\tOnly use alignments where all flag bits set here are set. [0]" << endl;
out << "\t\t" << "-F INT\tOnly use alignments where no flag bits set here are set. [0]" << endl;
out << "\t\t" << "-q INT\tOnly use alignments with this mapping quality or above. [0]" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << "\t\t" << "-r STR\tThe region to identify junctions \n"
<< "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl;
Expand Down Expand Up @@ -358,7 +377,7 @@ void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1, string i
return;
}

//Get the the barcode
//Get the barcode
void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
uint8_t *p = bam_aux_get(aln, barcode_tag_.c_str());
if(p != NULL) {
Expand All @@ -373,6 +392,14 @@ void JunctionsExtractor::set_junction_barcode(bam1_t *aln, Junction& j1) {
}
}

//Filters alignments based on filtering flags and mapping quality
bool JunctionsExtractor::filter_alignment(bam_hdr_t *header, bam1_t *aln) {
if ((aln->core.flag & filter_flags_) != 0) return true;
if ((aln->core.flag | require_flags_) != aln->core.flag) return true;
if (aln->core.qual < min_map_qual_) return true;
return false;
}

//Parse junctions from the read and store in junction map
int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) {
int n_cigar = aln->core.n_cigar;
Expand Down Expand Up @@ -523,6 +550,7 @@ int JunctionsExtractor::identify_junctions_from_BAM() {
//Initiate the alignment record
bam1_t *aln = bam_init1();
while(sam_itr_next(in, iter, aln) >= 0) {
if (filter_alignment(header, aln)) continue;
parse_alignment_into_junctions(header, aln);
}
hts_itr_destroy(iter);
Expand Down
19 changes: 18 additions & 1 deletion src/junctions/junctions_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,21 @@ class JunctionsExtractor {
string strand_tag_;
//tag used in BAM to denote single cell barcode
string barcode_tag_;
//filter reads containing any of these flags
uint16_t filter_flags_;
// filter reads not containing all of these flags
uint16_t require_flags_;
// filter reads below the minimum mapping quality
uint8_t min_map_qual_;
public:
//Default constructor
JunctionsExtractor() {
min_anchor_length_ = 8;
min_intron_length_ = 70;
max_intron_length_ = 500000;
filter_flags_ = 0;
require_flags_ = 0;
min_map_qual_ = 0;
junctions_sorted_ = false;
strandness_ = -1;
strand_tag_ = "XS";
Expand All @@ -204,14 +213,20 @@ class JunctionsExtractor {
uint32_t min_anchor_length1,
uint32_t min_intron_length1,
uint32_t max_intron_length1,
uint16_t filter_flags,
uint16_t require_flags,
uint8_t min_map_qual,
string ref1) :
bam_(bam1),
region_(region1),
strandness_(strandness1),
strand_tag_(strand_tag1),
min_anchor_length_(min_anchor_length1),
min_intron_length_(min_intron_length1),
min_intron_length_(min_anchor_length1),
max_intron_length_(max_intron_length1),
filter_flags_(filter_flags),
require_flags_(require_flags),
min_map_qual_(min_map_qual),
ref_(ref1) {
junctions_sorted_ = false;
output_file_ = "NA";
Expand Down Expand Up @@ -241,6 +256,8 @@ class JunctionsExtractor {
void create_junctions_vector();
//Pull out the cigar string from the read
int parse_read(bam_hdr_t *header, bam1_t *aln);
//Returns whether alignment should be filtered from junction analysis
bool filter_alignment(bam_hdr_t *header, bam1_t *aln);
//Parse junctions from the read and store in junction map
int parse_cigar_into_junctions(string chr, int read_pos,
uint32_t *cigar, int n_cigar);
Expand Down

0 comments on commit 409a492

Please sign in to comment.