Skip to content

Commit

Permalink
merge candidate regions [iqbal-lab-org#228]
Browse files Browse the repository at this point in the history
  • Loading branch information
mbhall88 committed Sep 23, 2020
1 parent 97742e5 commit 5e1c0f9
Show file tree
Hide file tree
Showing 6 changed files with 251 additions and 31 deletions.
6 changes: 3 additions & 3 deletions include/denovo_discovery/candidate_region.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ class CandidateRegion {

std::vector<Interval> identify_low_coverage_intervals(
const std::vector<uint32_t>& covg_at_each_position,
const uint32_t& min_required_covg = 2, const uint32_t& min_length = 1,
const uint32_t& max_length = 20);
const uint32_t& min_required_covg = 3, const uint32_t& min_length = 1,
const uint32_t& max_length = 76);

using CandidateRegions = std::unordered_map<CandidateRegionIdentifier, CandidateRegion>;

Expand All @@ -107,6 +107,6 @@ PileupConstructionMap construct_pileup_construction_map(

void load_all_candidate_regions_pileups_from_fastq(const fs::path& reads_filepath,
const CandidateRegions& candidate_regions,
const PileupConstructionMap& pileup_construction_map, const uint32_t threads = 1);
const PileupConstructionMap& pileup_construction_map, uint32_t threads = 1);

#endif // PANDORA_CANDIDATE_REGION_H
12 changes: 10 additions & 2 deletions include/interval.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@

#include <cstdint>
#include <iostream>
#include <vector>
#include <limits>
#include <cassert>
#include <algorithm>

struct Interval {
uint32_t start;
uint32_t length;
// in pilot, longest prg was 208,562 characters long
uint32_t length; // in pilot, longest prg was 208,562 characters long

Interval(uint32_t = 0, uint32_t = 0);

// non-inclusive
uint32_t get_end() const;

friend std::ostream& operator<<(std::ostream& out, const Interval& i);
Expand All @@ -26,4 +30,8 @@ struct Interval {
bool empty() const;
};

// Merge intervals within dist of each other. Changes the vector inplace.
// Time complexity: O(N log(N)) Sort the vector, then a single linear pass through
void merge_intervals_within(std::vector<Interval>&, uint32_t);

#endif
30 changes: 13 additions & 17 deletions src/denovo_discovery/candidate_region.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,30 +108,30 @@ CandidateRegions find_candidate_regions_for_pan_node(
i++;
}

const auto candidate_intervals { identify_low_coverage_intervals(
auto candidate_intervals { identify_low_coverage_intervals(
covgs_along_localnode_path) };
BOOST_LOG_TRIVIAL(trace) << candidate_intervals.size()
<< " candidate intervals before merging";
merge_intervals_within(candidate_intervals, candidate_region_interval_padding);
BOOST_LOG_TRIVIAL(trace) << candidate_intervals.size()
<< " candidate intervals after merging";

CandidateRegions candidate_regions;

BOOST_LOG_TRIVIAL(debug) << "there are " << candidate_intervals.size()
<< " intervals";

for (const auto& current_interval : candidate_intervals) {
CandidateRegion candidate_region { current_interval, pangraph_node->get_name(),
candidate_region_interval_padding };
BOOST_LOG_TRIVIAL(debug)
<< "Looking at interval: " << candidate_region.get_interval();
BOOST_LOG_TRIVIAL(debug) << "For gene: " << candidate_region.get_name();

const auto interval_path_components { find_interval_and_flanks_in_localpath(
candidate_region.get_interval(), local_node_max_likelihood_path) };

candidate_region.read_coordinates = pangraph_node->get_read_overlap_coordinates(
interval_path_components.slice);

BOOST_LOG_TRIVIAL(debug)
<< "there are " << candidate_region.read_coordinates.size()
<< " read_overlap_coordinates";
BOOST_LOG_TRIVIAL(trace)
<< "Candidate region with interval " << candidate_region.get_interval()
<< " for gene " << candidate_region.get_name() << " has "
<< candidate_region.read_coordinates.size() << " read overlaps";

candidate_region.max_likelihood_sequence
= local_prg->string_along_path(interval_path_components.slice);
Expand All @@ -152,7 +152,7 @@ std::vector<Interval> identify_low_coverage_intervals(
{
std::vector<Interval> identified_regions;
const auto predicate { [min_required_covg](
uint_least32_t x) { return x <= min_required_covg; } };
uint_least32_t x) { return x < min_required_covg; } };
auto first { covg_at_each_position.begin() };
auto previous { covg_at_each_position.begin() };
auto current { first };
Expand Down Expand Up @@ -204,7 +204,6 @@ void CandidateRegion::add_pileup_entry(
void CandidateRegion::write_denovo_paths_to_file(const fs::path& output_directory)
{
if (denovo_paths.empty()) {
BOOST_LOG_TRIVIAL(debug) << "No denovo paths for " << filename;
return;
}

Expand All @@ -213,9 +212,6 @@ void CandidateRegion::write_denovo_paths_to_file(const fs::path& output_director
const auto discovered_paths_filepath { output_directory / filename };

fasta.save(discovered_paths_filepath.string());

BOOST_LOG_TRIVIAL(debug) << "Denovo path for " << name
<< " written to: " << discovered_paths_filepath;
}

Fastaq CandidateRegion::generate_fasta_for_denovo_paths()
Expand Down Expand Up @@ -330,6 +326,6 @@ void load_all_candidate_regions_pileups_from_fastq(const fs::path& reads_filepat
}
}
}
BOOST_LOG_TRIVIAL(info) << "Loaded all candidate regions pileups from "
<< reads_filepath.string();
BOOST_LOG_TRIVIAL(trace) << "Loaded all candidate regions pileups from "
<< reads_filepath.string();
}
39 changes: 35 additions & 4 deletions src/interval.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#include <limits>
#include <cassert>
#include "interval.h"

#define assert_msg(x) !(std::cerr << "Assertion failed: " << x << std::endl)
Expand All @@ -14,7 +12,7 @@ Interval::Interval(uint32_t s, uint32_t e)
// strings can be represented
}

uint32_t Interval::get_end() const { return start + (uint32_t)length; }
uint32_t Interval::get_end() const { return start + length; }

std::ostream& operator<<(std::ostream& out, Interval const& i)
{
Expand Down Expand Up @@ -64,4 +62,37 @@ bool Interval::operator<(const Interval& y) const
return false;
}

bool Interval::empty() const { return length == 0; }
bool Interval::empty() const { return length == 0; }

void merge_intervals_within(std::vector<Interval>& intervals, const uint32_t dist)
{
if (intervals.size() < 2) { // nothing to merge
return;
}
std::sort(intervals.begin(), intervals.end());

// assumption that iv1 < iv2
auto close { [dist](const Interval& iv1, const Interval& iv2) -> bool {
const auto leading_edge { iv1.get_end() + dist };
return leading_edge > iv2.start;
} };

unsigned long prev_idx { 0 };
for (std::size_t i = 1; i < intervals.size(); ++i) {
const auto& current_iv { intervals[i] };
auto& prev_iv { intervals[prev_idx] };

if (close(prev_iv, current_iv)) {
const auto new_end { std::max(prev_iv.get_end(), current_iv.get_end()) };
intervals[prev_idx] = Interval(prev_iv.start, new_end);

// remove the current interval as it was merged into the previous interval
intervals.erase(intervals.begin() + i);
--i; // vector elements get relocated after erase so stay at current idx
} else {
++prev_idx;
}
}

intervals.resize(prev_idx + 1);
}
7 changes: 4 additions & 3 deletions src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,11 +276,11 @@ int pandora_map(MapOptions& opt)
<< "/pandora.pangraph.gfa";
write_pangraph_gfa(opt.outdir + "/pandora.pangraph.gfa", pangraph);

BOOST_LOG_TRIVIAL(info) << "Update LocalPRGs with hits...";
BOOST_LOG_TRIVIAL(info) << "Updating local PRGs with hits...";
uint32_t sample_id = 0;
pangraph->add_hits_to_kmergraphs(prgs);

BOOST_LOG_TRIVIAL(info) << "Estimate parameters for kmer graph model...";
BOOST_LOG_TRIVIAL(info) << "Estimating parameters for kmer graph model...";
auto exp_depth_covg = estimate_parameters(pangraph, opt.outdir, opt.kmer_size,
opt.error_rate, covg, opt.binomial, sample_id);
genotyping_options.add_exp_depth_covg(exp_depth_covg);
Expand Down Expand Up @@ -389,7 +389,8 @@ int pandora_map(MapOptions& opt)
// build the pileup for candidate regions multithreadly
if (opt.discover) {
BOOST_LOG_TRIVIAL(info)
<< "Building read pileups for candidate de novo regions...";
<< "Building read pileups for " << candidate_regions.size()
<< " candidate de novo regions...";
// the construct_pileup_construction_map function is intentionally left
// single threaded since it would require too much synchronization
const auto pileup_construction_map
Expand Down
Loading

0 comments on commit 5e1c0f9

Please sign in to comment.