Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #151 #152

Merged
merged 4 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions include/kmergraphwithcoverage.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class KmerGraphWithCoverage {
private:
//each coverage is represented as a std::vector, each position of the vector representing the coverage of a sample
//each coverage of a sample is represented by a std::pair, for the forward and reverse reads coverage of that sample
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> nodeIndex2SampleCoverage;
std::vector<std::vector<std::pair<uint16_t, uint16_t>>> nodeIndex2SampleCoverage;
leoisl marked this conversation as resolved.
Show resolved Hide resolved
uint32_t exp_depth_covg;
float binomial_parameter_p;
float negative_binomial_parameter_p;
Expand All @@ -47,23 +47,37 @@ class KmerGraphWithCoverage {
KmerGraphWithCoverage& operator=(KmerGraphWithCoverage&& other) = default; //move assignment operator
virtual ~KmerGraphWithCoverage() = default;

//getter
//getters
/*
TODO: we should return a uint16_t instead of a uint32_t, but I did not want to change the interface, as it can be dangerous
WARNING: some parts of the code accumulates under the type of this return value,
so it might be dangerous to change to uint16_t without careful analysis.
e.g.: I can see summing a bunch of coverage and potentially getting a value larger than 65535,
which would incur overflow and bugs.
NOTE: converting uint16_t to uint32_t is safe anyway, so it is fine to leave like this
NOTE: the advantage of returning a uint16_t here is a negligible improvement in RAM usage, so I don't think it is
worth the risk now
TODO: leaving return type as uint32_t to be safe for now, need a recheck later
*/
uint32_t get_covg(uint32_t node_id, bool strand, uint32_t sample_id) const;
uint32_t get_num_reads() const { return num_reads; }
uint32_t get_total_number_samples() const {return total_number_samples; }

//setters
void increment_covg(uint32_t node_id, bool strand, uint32_t sample_id);
void set_covg(uint32_t node_id, uint32_t value, bool strand, uint32_t sample_id);
void set_covg(uint32_t node_id, uint16_t value, bool strand, uint32_t sample_id);
leoisl marked this conversation as resolved.
Show resolved Hide resolved
void set_exp_depth_covg(const uint32_t);
void set_binomial_parameter_p(const float);
void set_negative_binomial_parameters(const float &, const float &);
void set_thresh (int thresh) { this->thresh = thresh; }
void set_num_reads(uint32_t num_reads) { this->num_reads = num_reads; }

void zeroCoverages() {
for (auto &sampleCoverage: nodeIndex2SampleCoverage)
sampleCoverage = std::vector<std::pair<uint32_t, uint32_t>>(total_number_samples);
for (auto &sampleCoverage: nodeIndex2SampleCoverage) {
leoisl marked this conversation as resolved.
Show resolved Hide resolved
sampleCoverage = std::vector<std::pair<uint16_t, uint16_t>>(total_number_samples);
sampleCoverage.shrink_to_fit(); //tries to make this information as compact as possible (TODO: use sdsl?)
}

}

float nbin_prob(uint32_t, const uint32_t &sample_id);
Expand Down
24 changes: 16 additions & 8 deletions src/kmergraphwithcoverage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdio> /* NULL */
#include <cstdlib> /* srand, rand */
#include <cmath>
#include <cstdint>

#include <boost/math/distributions/negative_binomial.hpp>
#include <boost/log/trivial.hpp>
Expand Down Expand Up @@ -36,10 +37,15 @@ void KmerGraphWithCoverage::set_binomial_parameter_p(const float e_rate) {
void KmerGraphWithCoverage::increment_covg(uint32_t node_id, bool strand, uint32_t sample_id) {
assert(this->nodeIndex2SampleCoverage[node_id].size() > sample_id);

//get a pointer to the value we want to increment
uint16_t* coverage_ptr = nullptr;
if (strand)
this->nodeIndex2SampleCoverage[node_id][sample_id].first++;
coverage_ptr = &(this->nodeIndex2SampleCoverage[node_id][sample_id].first);
else
this->nodeIndex2SampleCoverage[node_id][sample_id].second++;
coverage_ptr = &(this->nodeIndex2SampleCoverage[node_id][sample_id].second);

if ((*coverage_ptr) < UINT16_MAX) //checks if it is safe to increase the coverage (no overflow)
++(*coverage_ptr);
}

uint32_t KmerGraphWithCoverage::get_covg(uint32_t node_id, bool strand, uint32_t sample_id) const {
Expand All @@ -48,12 +54,12 @@ uint32_t KmerGraphWithCoverage::get_covg(uint32_t node_id, bool strand, uint32_t
return 0;

if (strand)
return this->nodeIndex2SampleCoverage[node_id][sample_id].first;
return (uint32_t)(this->nodeIndex2SampleCoverage[node_id][sample_id].first);
else
return this->nodeIndex2SampleCoverage[node_id][sample_id].second;
return (uint32_t)(this->nodeIndex2SampleCoverage[node_id][sample_id].second);
}

void KmerGraphWithCoverage::set_covg(uint32_t node_id, uint32_t value, bool strand, uint32_t sample_id) {
void KmerGraphWithCoverage::set_covg(uint32_t node_id, uint16_t value, bool strand, uint32_t sample_id) {
assert(this->nodeIndex2SampleCoverage[node_id].size() > sample_id);
if (strand)
this->nodeIndex2SampleCoverage[node_id][sample_id].first = value;
Expand Down Expand Up @@ -416,6 +422,7 @@ void KmerGraphWithCoverage::save(const std::string &filepath, const std::shared_
}

//TODO: THIS SHOULD BE RECODED, WE ARE DUPLICATING CODE HERE (SEE KmerGraph::load())!!!
//TODO: remove this method?
void KmerGraphWithCoverage::load(const std::string &filepath) {
//TODO: this might be dangerous, recode this?
auto kmer_prg = const_cast<KmerGraph*>(this->kmer_prg);
Expand All @@ -425,7 +432,8 @@ void KmerGraphWithCoverage::load(const std::string &filepath) {
std::string line;
std::vector<std::string> split_line;
std::stringstream ss;
uint32_t id = 0, covg, from, to;
uint32_t id = 0, from, to;
uint16_t covg;
prg::Path p;
uint32_t num_nodes = 0;

Expand Down Expand Up @@ -466,9 +474,9 @@ void KmerGraphWithCoverage::load(const std::string &filepath) {
if (kmer_prg->k == 0 and p.length() > 0) {
kmer_prg->k = p.length();
}
covg = stoi(split(split_line[3], "FC:i:")[0]);
covg = (uint16_t)(stoul(split(split_line[3], "FC:i:")[0]));
set_covg(n->id, covg, 0, sample_id);
covg = stoi(split(split_line[4], "RC:i:")[0]);
covg = (uint16_t)(stoul(split(split_line[4], "RC:i:")[0]));
set_covg(n->id, covg, 1, sample_id);
if (split_line.size() >= 6) {
n->num_AT = std::stoi(split_line[5]);
Expand Down
4 changes: 2 additions & 2 deletions src/pangenome/pangraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,8 @@ void pangenome::Graph::copy_coverages_to_kmergraphs(const Graph &ref_pangraph, c
for (auto & kmergraph_node_ptr : pangraph_node.kmer_prg_with_coverage.kmer_prg->nodes){
const auto &knode_id = kmergraph_node_ptr->id;
assert(knode_id < ref_node.kmer_prg_with_coverage.kmer_prg->nodes.size());
pangraph_node.kmer_prg_with_coverage.set_covg(knode_id, ref_node.kmer_prg_with_coverage.get_covg(knode_id, 0, ref_sample_id), 0, sample_id);
pangraph_node.kmer_prg_with_coverage.set_covg(knode_id, ref_node.kmer_prg_with_coverage.get_covg(knode_id, 1, ref_sample_id), 1, sample_id);
pangraph_node.kmer_prg_with_coverage.set_covg(knode_id, (uint16_t)(ref_node.kmer_prg_with_coverage.get_covg(knode_id, 0, ref_sample_id)), 0, sample_id);
pangraph_node.kmer_prg_with_coverage.set_covg(knode_id, (uint16_t)(ref_node.kmer_prg_with_coverage.get_covg(knode_id, 1, ref_sample_id)), 1, sample_id);
}
}
}
Expand Down
86 changes: 86 additions & 0 deletions test/kmergraphwithcoverage_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdint.h>
#include <iostream>
#include <cmath>
#include <cstdint>

using namespace prg;

Expand Down Expand Up @@ -105,6 +106,91 @@ TEST(KmerGraphWithCoverageTest, assign) {
EXPECT_EQ(kmergraph2, kmergraph2);
}*/

TEST(KmerGraphWithCoverageTest, get_and_set_increment_covg){
//create a kmergraph with 5 nodes
const uint32_t nb_of_nodes = 5;
KmerGraph kmergraph;

auto create_kmergraph_helper = [&](uint32_t node_index) {
std::deque<Interval> interval = {Interval(node_index, node_index)};
prg::Path path;
path.initialize(interval);
kmergraph.add_node(path);
};

for (uint32_t node_index=0; node_index < nb_of_nodes; ++node_index)
create_kmergraph_helper(node_index);

//create a KmerGraphWithCoverage on 10 samples
const uint32_t nb_of_samples = 10;
KmerGraphWithCoverage kmergraph_with_coverage(&kmergraph, nb_of_samples);

//function that will help to check the coverages in the tests
using Nodeindex_Strand_Sampleindex_Tuple = std::tuple<uint32_t, bool, uint32_t>;
std::map<Nodeindex_Strand_Sampleindex_Tuple, uint16_t> expected_coverage;
auto check_coverages = [&]() {
//verifies all <node, strand, sample> coverage matches expected_coverage
//if the tuple does not exist, then the expected_coverage is assumed to be 0 (as per std::map::operator[] and uint16_t default constructor)
for (uint32_t node_index=0; node_index < nb_of_nodes; ++node_index) {
for (uint32_t sample_index=0; sample_index<nb_of_samples; ++sample_index) {
EXPECT_EQ(kmergraph_with_coverage.get_covg(node_index, 0, sample_index), expected_coverage[make_tuple(node_index, 0, sample_index)]);
EXPECT_EQ(kmergraph_with_coverage.get_covg(node_index, 1, sample_index), expected_coverage[make_tuple(node_index, 1, sample_index)]);
}
}
};

//1. expect all coverages to be 0
check_coverages();

//2. lets set some random coverages
auto set_covg_helper = [&] (uint32_t node_id, uint16_t value, bool strand, uint32_t sample_id) {
kmergraph_with_coverage.set_covg(node_id, value, strand, sample_id);
expected_coverage[make_tuple(node_id, strand, sample_id)] = value;
};
set_covg_helper(0,100,0,5);
set_covg_helper(0,150,1,5);
set_covg_helper(1,789,1,9);
set_covg_helper(2,120,0,3);
set_covg_helper(3,130,0,2);
set_covg_helper(4,780,1,7);
check_coverages();

//3. setup the maximum coverage
set_covg_helper(4,UINT16_MAX,0,1);
set_covg_helper(4,UINT16_MAX,1,1);
check_coverages();

//4. setup the minimum coverage
set_covg_helper(3,0,0,1);
set_covg_helper(3,0,1,1);
check_coverages();

//5. test increment coverage
auto increment_covg_helper = [&] (uint32_t node_id, bool strand, uint32_t sample_id) {
auto old_covg = kmergraph_with_coverage.get_covg(node_id, strand, sample_id);
kmergraph_with_coverage.increment_covg(node_id, strand, sample_id);
expected_coverage[make_tuple(node_id, strand, sample_id)] = old_covg+1;
};
increment_covg_helper(0,0,5);
increment_covg_helper(0,1,5);
increment_covg_helper(1,1,9);
increment_covg_helper(2,0,3);
increment_covg_helper(3,0,2);
increment_covg_helper(4,1,7);
increment_covg_helper(3,0,1);
increment_covg_helper(3,1,1);
check_coverages();


//6. test incrementing coverage above the maximum value
set_covg_helper(2,UINT16_MAX,0,9); //first set to max value
check_coverages();
kmergraph_with_coverage.increment_covg(2, 0, 9); //now try to increment
EXPECT_EQ(kmergraph_with_coverage.get_covg(2, 0, 9), UINT16_MAX); //should still be UINT16_MAX
check_coverages(); //to be sure nothing changed
}


TEST(KmerGraphWithCoverageTest, set_exp_depth_covg){
KmerGraph kmergraph;
KmerGraphWithCoverage kmergraph_with_coverage(&kmergraph);
Expand Down