Skip to content

Commit

Permalink
trade space for time and store edges in their own hacked vector
Browse files Browse the repository at this point in the history
  • Loading branch information
ekg committed Apr 25, 2020
1 parent 5e18490 commit 52dcdf3
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 82 deletions.
100 changes: 38 additions & 62 deletions src/node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,55 +5,33 @@
namespace odgi {

/// Report the size of this node's sequence.
/// NOTE(review): this text looks like a diff rendered without +/- markers, so
/// two alternative return statements appear back to back. As written, only the
/// first can execute. Presumably `return seq_bytes();` is the removed line and
/// `return sequence.size();` its replacement -- confirm against the repository.
uint64_t node_t::sequence_size(void) const {
return seq_bytes();
return sequence.size();
}

/// Return the node's sequence as a std::string, by value.
/// NOTE(review): this text looks like a diff rendered without +/- markers. Two
/// signatures (`sequence` and `get_sequence`) and their bodies are interleaved
/// and share a single closing brace, so the span is not valid C++ as shown.
/// Presumably the first three lines are the removed form -- confirm.
const std::string node_t::sequence(void) const {
// Builds a string from seq_bytes() chars of `bytes`, starting at seq_start().
const std::string res((char*)bytes.data()+seq_start(), seq_bytes());
return res;
const std::string node_t::get_sequence(void) const {
// Returns a copy of the `sequence` member.
return sequence;
}

/// Replace the node's stored sequence with `seq`.
/// NOTE(review): this text looks like a diff rendered without +/- markers. The
/// `bytes`-based resizing/memcpy statements and the final `sequence = seq;`
/// are presumably the removed and added forms respectively -- confirm.
void node_t::set_sequence(const std::string& seq) {
// New sequence is longer: insert zero bytes at seq_start() to make room,
// then record the new length.
if (seq.size() > seq_bytes()) {
bytes.reserve(bytes.size()+seq.size()-seq_bytes());
bytes.insert(bytes.begin()+seq_start(), seq.size() - seq_bytes(), 0);
set_seq_bytes(seq.size());
// New sequence is shorter: erase the surplus bytes from the front of the
// sequence region, then record the new length.
} else if (seq.size() < seq_bytes()) {
bytes.erase(bytes.begin()+seq_start(), bytes.begin()+seq_start()+(seq_bytes()-seq.size()));;
set_seq_bytes(seq.size());
}
// Overwrite the region at seq_start() with the characters of `seq`.
memcpy(bytes.data()+seq_start(), seq.c_str(), seq.size());
// Copy `seq` into the `sequence` member.
sequence = seq;
}

/// Provide access to the node's edge records.
/// NOTE(review): this text looks like a diff rendered without +/- markers. Two
/// signatures (`edges`, returning a vector by value, and `get_edges`,
/// returning a const reference) are interleaved and share one closing brace,
/// so the span is not valid C++ as shown. Presumably `edges` is the removed
/// form -- confirm.
std::vector<uint64_t> node_t::edges(void) const {
std::vector<uint64_t> res;
if (edge_count()) {
// One slot per field: edge_count() records of EDGE_RECORD_LENGTH values.
res.resize(edge_count()*EDGE_RECORD_LENGTH);
// Presumably decodes that many values from `bytes`, starting at
// edge_start(), into res -- sqvarint::decode is not visible here.
sqvarint::decode(res.data(),
(uint8_t*)bytes.data()+edge_start(),
edge_count()*EDGE_RECORD_LENGTH);
}
return res;
const dyn::hacked_vector& node_t::get_edges(void) const {
// Hands back a reference to the `edges` member instead of building a copy.
return edges;
}

/// Add one edge record consisting of `relative_id` followed by `edge_type`.
/// NOTE(review): this text looks like a diff rendered without +/- markers. The
/// `bytes`/sqvarint statements and the two `edges.push_back` calls are
/// presumably the removed and added forms respectively -- confirm.
void node_t::add_edge(const uint64_t& relative_id, const uint64_t& edge_type) {
//std::cerr << "add edge " << "relative_id " << relative_id << " edge_type " << edge_type << std::endl;
// Byte-buffer form: size the encoded pair, open a zero-filled gap of that
// size at edge_start(), encode into the gap, then bump both counters.
uint64_t add_edge_bytes = sqvarint::length({relative_id, edge_type});
bytes.reserve(bytes.size()+add_edge_bytes);
bytes.insert(bytes.begin()+edge_start(), add_edge_bytes, 0);
sqvarint::encode({relative_id, edge_type}, bytes.data()+edge_start());
set_edge_bytes(edge_bytes() + add_edge_bytes);
set_edge_count(edge_count() + 1);
// Vector form: append the two fields to `edges`, id first, then type.
edges.push_back(relative_id);
edges.push_back(edge_type);
}

/// Remove the edge record at index `rank`.
/// NOTE(review): this text looks like a diff rendered without +/- markers. The
/// `bytes`/sqvarint statements and the `edges.remove` loop are presumably the
/// removed and added forms respectively -- confirm.
void node_t::remove_edge(const uint64_t& rank) {
// Bounds check; only active when assertions are compiled in.
assert(rank < edge_count());
// Byte-buffer form: locate the record's byte offset, measure its encoded
// length j, erase those bytes and adjust both counters.
uint64_t edge_offset = edge_start() + sqvarint::bytes(bytes.data()+edge_start(), EDGE_RECORD_LENGTH*rank);
// a bit redundant
uint64_t j = sqvarint::bytes(bytes.data()+edge_offset, EDGE_RECORD_LENGTH);
bytes.erase(bytes.begin()+edge_offset, bytes.begin()+edge_offset+j);
set_edge_count(edge_count()-1);
set_edge_bytes(edge_bytes()-j);
// Vector form: the record starts at element EDGE_RECORD_LENGTH*rank.
uint64_t offset = EDGE_RECORD_LENGTH*rank;
// Removes at the same offset EDGE_RECORD_LENGTH times; presumably each
// remove shifts later elements down -- confirm dyn::hacked_vector::remove.
for (uint8_t i = 0; i < EDGE_RECORD_LENGTH; ++i) {
edges.remove(offset);
}
}

void node_t::add_path_step(const uint64_t& path_id, const bool& is_rev,
Expand Down Expand Up @@ -139,56 +117,54 @@ void node_t::remove_path_step(const uint64_t& rank) {
}

/// Reset the node: drop its sequence, its edges and its path steps.
/// NOTE(review): this text looks like a diff rendered without +/- markers. The
/// `set_*`/`bytes.clear()` statements and the `sequence.clear()` /
/// `clear_edges()` calls are presumably the removed and added forms -- confirm.
void node_t::clear(void) {
// Byte-buffer form: zero the three counters, then empty the buffer.
set_seq_bytes(0);
set_edge_bytes(0);
set_edge_count(0);
bytes.clear();
// Member form: empty the string and delegate to the edge clearer.
sequence.clear();
clear_edges();
// Path steps are cleared through the dedicated helper.
clear_path_steps();
}

void node_t::clear_edges(void) {
dyn::hacked_vector null_iv;
edges = null_iv;
}

void node_t::clear_path_steps(void) {
dyn::hacked_vector null_iv;
path_steps = null_iv;
}

/// Write this node to `out` in binary form.
/// @param out destination stream.
/// @return the running byte tally accumulated in `written`.
/// NOTE(review): this text looks like a diff rendered without +/- markers. The
/// counter/`bytes` writes and the `sequence`/`edges` writes are presumably the
/// removed and added forms respectively -- confirm.
uint64_t node_t::serialize(std::ostream& out) const {
uint64_t written = 0;
// Byte-buffer form: three 32-bit counters, then the buffer length and data.
out.write((char*)&_seq_bytes, sizeof(uint32_t));
out.write((char*)&_edge_bytes, sizeof(uint32_t));
out.write((char*)&_edge_count, sizeof(uint32_t));
// NOTE(review): the tally adds four uint32_t plus one uint8_t, but only three
// uint32_t writes are visible above -- verify the count.
written += sizeof(uint32_t)*4 + sizeof(uint8_t);
uint64_t node_size = bytes.size();
out.write((char*)&node_size, sizeof(node_size));
written += sizeof(uint64_t);
out.write((char*)bytes.data(), node_size*sizeof(uint8_t));
written += sizeof(uint8_t)*node_size;
// Member form: a length prefix followed by the sequence characters. The
// prefix is a raw size_t, so its width follows the platform's size_t.
size_t seq_size = sequence.size();
out.write((char*)&seq_size, sizeof(size_t));
written += sizeof(size_t);
out << sequence;
written += sequence.size();
// Each container's serialize() result is added to the tally.
written += edges.serialize(out);
written += path_steps.serialize(out);
return written;
}

/// Read this node's state from `in`, in the order the statements below show.
/// @param in source stream; no read is checked for failure here.
/// NOTE(review): this text looks like a diff rendered without +/- markers. The
/// counter/`bytes` reads and the `sequence`/`edges` reads are presumably the
/// removed and added forms respectively -- confirm.
void node_t::load(std::istream& in) {
// Byte-buffer form: three 32-bit counters, then the buffer length and data.
in.read((char*)&_seq_bytes, sizeof(uint32_t));
in.read((char*)&_edge_bytes, sizeof(uint32_t));
in.read((char*)&_edge_count, sizeof(uint32_t));
uint64_t node_size = 0;
in.read((char*)&node_size, sizeof(node_size));
bytes.resize(node_size);
in.read((char*)bytes.data(), node_size*sizeof(uint8_t));
// Member form: a size_t length prefix, then that many sequence characters.
// seq_size has no initializer, so it is indeterminate if the read fails.
size_t seq_size;
in.read((char*)&seq_size, sizeof(size_t));
sequence.resize(seq_size);
// NOTE(review): this writes through the pointer returned by c_str() after
// casting away const; the standard does not permit modifying that array.
// Consider &sequence[0] (or non-const data() in C++17).
in.read((char*)sequence.c_str(), seq_size);
// Each container reads its own representation from the stream.
edges.load(in);
path_steps.load(in);
}

void node_t::display(void) const {
std::cerr << "self_bytes " << bytes.size() << " "
<< "seq_bytes " << seq_bytes() << " "
<< "seq " << sequence() << " "
<< "edge_start " << edge_start() << " "
std::cerr << "seq " << sequence << " "
<< "edge_count " << edge_count() << " "
<< "edge_bytes " << edge_bytes() << " "
<< "path_count " << path_count() << " | ";
for (auto i : bytes) {
std::cerr << (int) i << " ";
if (edge_count()) {
for (uint64_t i = 0; i < edge_count(); ++i) {
std::cerr
<< edges.at(i) << ":"
<< edges.at(i+1) << " ";
}
}
std::cerr << " | ";
std::cerr << "| ";
if (path_count()) {
for (uint64_t i = 0; i < path_count(); ++i) {
std::cerr
Expand Down
20 changes: 7 additions & 13 deletions src/node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,14 @@ const uint8_t PATH_RECORD_LENGTH = 5;

/// A node object with the sequence, its edge lists, and paths
class node_t {
std::vector<uint8_t> bytes;
std::string sequence;
dyn::hacked_vector edges;
dyn::hacked_vector path_steps;
uint32_t _seq_bytes = 0;
uint32_t _edge_bytes = 0;
uint32_t _edge_count = 0;
public:
inline const uint64_t seq_start(void) const { return 0; }
inline const uint64_t seq_bytes(void) const { return _seq_bytes; }
inline const uint64_t edge_start(void) const { return _seq_bytes; }
inline const uint64_t edge_count(void) const { return _edge_count; }
inline const uint64_t edge_bytes(void) const { return _edge_bytes; }
inline const uint64_t seq_bytes(void) const { return sequence.size(); }
inline const uint64_t edge_count(void) const { return edges.size()/EDGE_RECORD_LENGTH; }
inline const uint64_t path_count(void) const { return path_steps.size()/PATH_RECORD_LENGTH; }
inline void set_seq_bytes(const uint64_t& i) { _seq_bytes = i; }
inline void set_edge_count(const uint64_t& i) { _edge_count = i; }
inline void set_edge_bytes(const uint64_t& i) { _edge_bytes = i; }
struct step_t {
uint64_t data[5] = { 0, 0, 0, 0, 0 }; // PATH_RECORD_LENGTH
step_t(void) { }
Expand Down Expand Up @@ -65,9 +58,9 @@ class node_t {
inline void set_next_rank(const uint64_t& i) { data[4] = i; }
};
uint64_t sequence_size(void) const;
const std::string sequence(void) const;
const std::string get_sequence(void) const;
void set_sequence(const std::string& seq);
std::vector<uint64_t> edges(void) const;
const dyn::hacked_vector& get_edges(void) const;
void add_edge(const uint64_t& relative_id, const uint64_t& edge_type);
void remove_edge(const uint64_t& rank);
void add_path_step(const uint64_t& path_id, const bool& is_rev,
Expand All @@ -86,6 +79,7 @@ class node_t {
void remove_path_step(const uint64_t& rank);
void update_path_last_bytes(void);
void clear(void);
void clear_edges(void);
void clear_path_steps(void);
uint64_t serialize(std::ostream& out) const;
void load(std::istream& in);
Expand Down
14 changes: 7 additions & 7 deletions src/odgi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ size_t graph_t::get_length(const handle_t& handle) const {

/// Get the sequence of a node, presented in the handle's local forward orientation.
std::string graph_t::get_sequence(const handle_t& handle) const {
// NOTE(review): this text looks like a diff rendered without +/- markers, so
// `seq` is declared twice; presumably the `.sequence()` line is the removed
// form and the `.get_sequence()` line its replacement -- confirm.
// The node is looked up in node_v by the number unpacked from the handle.
auto& seq = node_v.at(number_bool_packing::unpack_number(handle)).sequence();
auto& seq = node_v.at(number_bool_packing::unpack_number(handle)).get_sequence();
// Reversed handles get the reverse complement; others get seq unchanged.
return (get_is_reverse(handle) ? reverse_complement(seq) : seq);
}

Expand All @@ -60,7 +60,7 @@ bool graph_t::follow_edges_impl(const handle_t& handle, bool go_left, const std:
const node_t& node = node_v.at(number_bool_packing::unpack_number(handle));
bool is_rev = get_is_reverse(handle);
nid_t node_id = get_id(handle);
const std::vector<uint64_t> node_edges = node.edges();
auto& node_edges = node.get_edges();
if (node_edges.size() == 0) return true;
for (uint64_t i = 0; i < node_edges.size(); i+=2) {
// unpack the edge
Expand Down Expand Up @@ -603,7 +603,7 @@ void graph_t::destroy_edge(const handle_t& left_h, const handle_t& right_h) {
nid_t right_node_id = get_id(right_h);
nid_t left_node_id = get_id(left_h);

std::vector<uint64_t> left_node_edges = left_node.edges();
auto& left_node_edges = left_node.get_edges();
bool found_edge = false;
for (uint64_t i = 0; i < left_node_edges.size(); ) {
uint64_t other_id = edge_delta_to_id(left_node_id, left_node_edges.at(i++));
Expand All @@ -622,7 +622,7 @@ void graph_t::destroy_edge(const handle_t& left_h, const handle_t& right_h) {
}
}

std::vector<uint64_t> right_node_edges = right_node.edges();
auto& right_node_edges = right_node.get_edges();
for (uint64_t i = 0; i < right_node_edges.size(); ) {
uint64_t other_id = edge_delta_to_id(right_node_id, right_node_edges.at(i++));
uint8_t packed_edge = right_node_edges.at(i++);
Expand Down Expand Up @@ -1332,8 +1332,8 @@ void graph_t::display(void) const {
for (uint64_t i = 0; i < node_v.size(); ++i) {
auto& node = node_v.at(i);
nid_t node_id = i+1;
std::cerr << node_id << ":" << node.sequence() << " ";
const std::vector<uint64_t> node_edges = node.edges();
std::cerr << node_id << ":" << node.get_sequence() << " ";
auto& node_edges = node.get_edges();
for (uint64_t j = 0; j < node_edges.size(); ++j) {
std::cerr << node_edges.at(j) << ",";
}
Expand Down Expand Up @@ -1381,7 +1381,7 @@ void graph_t::to_gfa(std::ostream& out) const {
const node_t& node = node_v.at(number_bool_packing::unpack_number(h));
bool is_rev = get_is_reverse(h);
nid_t node_id = get_id(h);
const std::vector<uint64_t> node_edges = node.edges();
auto& node_edges = node.get_edges();
for (uint64_t i = 0; i < node_edges.size(); i+=2) {
// unpack the edge
uint64_t other_id = edge_delta_to_id(node_id, node_edges.at(i));
Expand Down

0 comments on commit 52dcdf3

Please sign in to comment.