From ac94f9eb892a9836e9d3a2cb593732f22fca5234 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Wed, 21 Oct 2020 17:43:03 +1000 Subject: [PATCH 1/9] exclude nonessential GATB files and limit ksize --- ext/gatb.cmake | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/ext/gatb.cmake b/ext/gatb.cmake index c587b9d9..f4a60c3f 100644 --- a/ext/gatb.cmake +++ b/ext/gatb.cmake @@ -1,30 +1,15 @@ include(ExternalProject) -execute_process(COMMAND mkdir -p ${CMAKE_BINARY_DIR}/ext) -execute_process(COMMAND mkdir -p ${CMAKE_BINARY_DIR}/bin) -execute_process(COMMAND mkdir -p ${CMAKE_BINARY_DIR}/lib) -execute_process(COMMAND mkdir -p ${CMAKE_BINARY_DIR}/include) - -set(GATB_DIR ${CMAKE_BINARY_DIR}/ext/gatb-core-1.4.1/gatb-core) -set(GATB_BUILD_DIR ${GATB_DIR}/build) -execute_process(COMMAND mkdir -p ${GATB_BUILD_DIR}) +# We don't want to install some GATB-CORE artifacts +SET (GATB_CORE_EXCLUDE_TOOLS 1) +SET (GATB_CORE_EXCLUDE_TESTS 1) +SET (GATB_CORE_INCLUDE_EXAMPLES 1) ExternalProject_Add(gatb - DOWNLOAD_COMMAND wget https://github.com/GATB/gatb-core/archive/v1.4.1.tar.gz --timestamping -O gatb.tar.gz - DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/ext" - CONFIGURE_COMMAND "" - BUILD_COMMAND bash -c "cd ${GATB_BUILD_DIR} && cmake -D CMAKE_C_COMPILER=${CMAKE_C_COMPILER} -D CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} ${GATB_DIR} && make" - INSTALL_COMMAND "" - TEST_COMMAND "") -ExternalProject_Add_Step(gatb extract_tar - COMMAND tar -xvzf ${CMAKE_BINARY_DIR}/ext/gatb.tar.gz -C ${CMAKE_BINARY_DIR}/ext - DEPENDEES download - DEPENDERS configure) + GIT_REPOSITORY https://github.com/GATB/gatb-core.git + GIT_TAG "v1.4.1" + PREFIX "${CMAKE_CURRENT_BINARY_DIR}/gatb" + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/gatb/src/gatb/gatb-core/gatb-core" + CMAKE_ARGS -DKSIZE_LIST=32 + INSTALL_COMMAND "") -ExternalProject_Add_Step(gatb copy - COMMAND bash -c "cp -r ${GATB_BUILD_DIR}/bin/* ${CMAKE_BINARY_DIR}/bin/" - COMMAND bash -c "cp -r ${GATB_BUILD_DIR}/lib/* ${CMAKE_BINARY_DIR}/lib/" - COMMAND bash -c "cp -r ${GATB_BUILD_DIR}/include/* ${CMAKE_BINARY_DIR}/include/" - COMMAND bash -c "cp -r ${GATB_DIR}/src/gatb/* ${CMAKE_BINARY_DIR}/include/gatb/" - DEPENDEES build) From a40c3b36ddd0acbc4bf3554c8f9105363047e186 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Thu, 22 Oct 2020 16:18:20 +1000 Subject: [PATCH 2/9] restrict denovo kmer size --- include/denovo_discovery/denovo_discovery.h | 4 ++-- include/denovo_discovery/discover_main.h | 3 ++- src/denovo_discovery/denovo_discovery.cpp | 2 +- src/denovo_discovery/discover_main.cpp | 1 + 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/denovo_discovery/denovo_discovery.h b/include/denovo_discovery/denovo_discovery.h index 0c28720b..32613954 100644 --- a/include/denovo_discovery/denovo_discovery.h +++ b/include/denovo_discovery/denovo_discovery.h @@ -18,7 +18,7 @@ class DenovoDiscovery { bool clean_assembly_graph { false }; const uint16_t max_insertion_size; - DenovoDiscovery(const uint32_t& kmer_size, const double& read_error_rate, + DenovoDiscovery(const uint16_t& kmer_size, const double& read_error_rate, uint16_t max_insertion_size = 15, uint16_t min_covg_for_node_in_assembly_graph = 1); void find_paths_through_candidate_region( @@ -28,7 +28,7 @@ class DenovoDiscovery { const uint32_t& read_covg, const uint32_t& ref_length) const; private: - const uint32_t kmer_size; + const uint16_t kmer_size; const double read_error_rate; }; diff --git a/include/denovo_discovery/discover_main.h 
b/include/denovo_discovery/discover_main.h index 1188bb9b..408fcd42 100644 --- a/include/denovo_discovery/discover_main.h +++ b/include/denovo_discovery/discover_main.h @@ -13,6 +13,7 @@ #include "denovo_discovery/candidate_region.h" #include "denovo_discovery/denovo_discovery.h" +constexpr auto MAX_DENOVO_K { 32 }; namespace fs = boost::filesystem; /// Collection of all options of discover subcommand. @@ -33,7 +34,7 @@ struct DiscoverOptions { bool clean { false }; bool binomial { false }; uint32_t max_covg { 600 }; - uint32_t denovo_kmer_size { 11 }; + uint16_t denovo_kmer_size { 11 }; uint16_t max_insertion_size { 15 }; uint16_t min_covg_for_node_in_assembly_graph { 2 }; uint32_t min_candidate_covg { 3 }; diff --git a/src/denovo_discovery/denovo_discovery.cpp b/src/denovo_discovery/denovo_discovery.cpp index f13e720f..049196e2 100644 --- a/src/denovo_discovery/denovo_discovery.cpp +++ b/src/denovo_discovery/denovo_discovery.cpp @@ -132,7 +132,7 @@ void DenovoDiscovery::find_paths_through_candidate_region( remove_graph_file(GATB_graph_filepath); } -DenovoDiscovery::DenovoDiscovery(const uint32_t& kmer_size, +DenovoDiscovery::DenovoDiscovery(const uint16_t& kmer_size, const double& read_error_rate, const uint16_t max_insertion_size, const uint16_t min_covg_for_node_in_assembly_graph) : min_covg_for_node_in_assembly_graph { min_covg_for_node_in_assembly_graph } diff --git a/src/denovo_discovery/discover_main.cpp b/src/denovo_discovery/discover_main.cpp index ad82373e..b821c02d 100644 --- a/src/denovo_discovery/discover_main.cpp +++ b/src/denovo_discovery/discover_main.cpp @@ -107,6 +107,7 @@ void setup_discover_subcommand(CLI::App& app) ->add_option("--discover-k", opt->denovo_kmer_size, "K-mer size to use when discovering novel variants") ->capture_default_str() + ->check(CLI::Range(0, MAX_DENOVO_K)) ->type_name("INT"); std::string description = "Max. insertion size for novel variants. 
Warning: " From f72f62151c7c86573252d9758843705ab41f3876 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Thu, 22 Oct 2020 17:00:27 +1000 Subject: [PATCH 3/9] expose discover opt to clean local assembly graph --- include/denovo_discovery/denovo_discovery.h | 15 ++++++++------- include/denovo_discovery/discover_main.h | 1 + src/denovo_discovery/denovo_discovery.cpp | 13 ++++++++----- src/denovo_discovery/discover_main.cpp | 8 +++++++- src/denovo_discovery/local_assembly.cpp | 17 ++++++++--------- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/include/denovo_discovery/denovo_discovery.h b/include/denovo_discovery/denovo_discovery.h index 32613954..fbaf49b1 100644 --- a/include/denovo_discovery/denovo_discovery.h +++ b/include/denovo_discovery/denovo_discovery.h @@ -13,23 +13,24 @@ namespace fs = boost::filesystem; class DenovoDiscovery { +private: + const uint16_t kmer_size; + const double read_error_rate; + public: - const uint16_t min_covg_for_node_in_assembly_graph; - bool clean_assembly_graph { false }; const uint16_t max_insertion_size; + const uint16_t min_covg_for_node_in_assembly_graph; + bool clean_assembly_graph; DenovoDiscovery(const uint16_t& kmer_size, const double& read_error_rate, - uint16_t max_insertion_size = 15, uint16_t min_covg_for_node_in_assembly_graph = 1); + uint16_t max_insertion_size = 15, + uint16_t min_covg_for_node_in_assembly_graph = 1, bool clean = false); void find_paths_through_candidate_region( CandidateRegion& candidate_region, const fs::path& denovo_output_directory); double calculate_kmer_coverage( const uint32_t& read_covg, const uint32_t& ref_length) const; - -private: - const uint16_t kmer_size; - const double read_error_rate; }; #endif // PANDORA_DENOVO_DISCOVERY_H diff --git a/include/denovo_discovery/discover_main.h b/include/denovo_discovery/discover_main.h index 408fcd42..e797a069 100644 --- a/include/denovo_discovery/discover_main.h +++ b/include/denovo_discovery/discover_main.h @@ -44,6 +44,7 @@ struct DiscoverOptions { uint32_t merge_dist { 22 }; uint32_t min_cluster_size { 10 }; uint32_t max_num_kmers_to_avg { 100 }; + bool clean_dbg { false }; }; void setup_discover_subcommand(CLI::App& app); diff --git a/src/denovo_discovery/denovo_discovery.cpp b/src/denovo_discovery/denovo_discovery.cpp index 049196e2..25160569 100644 --- a/src/denovo_discovery/denovo_discovery.cpp +++ b/src/denovo_discovery/denovo_discovery.cpp @@ -38,8 +38,10 @@ void DenovoDiscovery::find_paths_through_candidate_region( new BankStrings(candidate_region.pileup), "-kmer-size %d -abundance-min %d -verbose 0 -nb-cores 1 -out %s", kmer_size, min_covg_for_node_in_assembly_graph, GATB_graph_filepath_as_string.c_str()); - if (clean_assembly_graph) { + if (this->clean_assembly_graph) { + BOOST_LOG_TRIVIAL(debug) << "Cleaning local assembly graph..."; clean(gatb_graph); + BOOST_LOG_TRIVIAL(debug) << "Local assembly graph cleaned"; } graph = gatb_graph; // TODO: use move constructor @@ -134,11 +136,12 @@ void DenovoDiscovery::find_paths_through_candidate_region( DenovoDiscovery::DenovoDiscovery(const uint16_t& kmer_size, const double& read_error_rate, const uint16_t max_insertion_size, - const uint16_t min_covg_for_node_in_assembly_graph) - : min_covg_for_node_in_assembly_graph { min_covg_for_node_in_assembly_graph } - , max_insertion_size { max_insertion_size } - , kmer_size { kmer_size } + const uint16_t min_covg_for_node_in_assembly_graph, const bool clean) + : kmer_size { kmer_size } , read_error_rate { read_error_rate } + , max_insertion_size { 
max_insertion_size } + , min_covg_for_node_in_assembly_graph { min_covg_for_node_in_assembly_graph } + , clean_assembly_graph { clean } { } diff --git a/src/denovo_discovery/discover_main.cpp b/src/denovo_discovery/discover_main.cpp index b821c02d..f2c6788a 100644 --- a/src/denovo_discovery/discover_main.cpp +++ b/src/denovo_discovery/discover_main.cpp @@ -92,6 +92,11 @@ void setup_discover_subcommand(CLI::App& app) "--clean", opt->clean, "Add a step to clean and detangle the pangraph") ->group("Filtering"); + discover_subcmd + ->add_flag( + "--clean-dbg", opt->clean_dbg, "Clean the local assembly de Bruijn graph") + ->group("Filtering"); + discover_subcmd ->add_flag("--bin", opt->binomial, "Use binomial model for kmer coverages [default: negative binomial]") @@ -343,7 +348,8 @@ int pandora_discover(DiscoverOptions& opt) } DenovoDiscovery denovo { opt.denovo_kmer_size, opt.error_rate, - opt.max_insertion_size, opt.min_covg_for_node_in_assembly_graph }; + opt.max_insertion_size, opt.min_covg_for_node_in_assembly_graph, + opt.clean_dbg }; const fs::path denovo_output_directory { fs::path(opt.outdir) / "denovo_paths" }; fs::create_directories(denovo_output_directory); BOOST_LOG_TRIVIAL(info) diff --git a/src/denovo_discovery/local_assembly.cpp b/src/denovo_discovery/local_assembly.cpp index d763def3..600c7a25 100644 --- a/src/denovo_discovery/local_assembly.cpp +++ b/src/denovo_discovery/local_assembly.cpp @@ -259,15 +259,14 @@ void clean(Graph& graph, const uint16_t& num_cores) graph_simplifications._doBulgeRemoval = false; graph_simplifications._doECRemoval = false; - graph_simplifications._tipLen_Topo_kMult - = 2; // remove all tips of length <= k * X bp [default '2.500000'] set to 0 to - // turn off - graph_simplifications._tipLen_RCTC_kMult - = 0; // remove tips that pass coverage criteria, of length <= k * X bp [default - // '10.000000'] set to 0 to turn off - graph_simplifications._tipRCTCcutoff - = 2; // tip relative coverage coefficient: mean coverage of neighbors > X * tip - // coverage default 2.0 + // remove all tips of length <= k * X bp [default '2.500000'] set to 0 to turn off + graph_simplifications._tipLen_Topo_kMult = 2; + // remove tips that pass coverage criteria, of length <= k * X bp [default + // '10.000000'] set to 0 to turn off + graph_simplifications._tipLen_RCTC_kMult = 0; + // tip relative coverage coefficient: mean coverage of neighbors > X * tip coverage + // default 2.0 + graph_simplifications._tipRCTCcutoff = 2; graph_simplifications.simplify(); } From a3d0797b4708a75332e03226034cc4e992a542f5 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Tue, 27 Oct 2020 09:37:09 +1000 Subject: [PATCH 4/9] fix linking issues for gatb libraries --- CMakeLists.txt | 3 +++ ext/gatb.cmake | 7 ++++++- include/denovo_discovery/local_assembly.h | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b541842..3a731a6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ set(EXTERNAL_LIBS gatbcore hdf5 hdf5_tools) +link_directories(${gatb_binary_dir}/lib) include(${PROJECT_SOURCE_DIR}/ext/gtest.cmake) @@ -53,6 +54,8 @@ include(${PROJECT_SOURCE_DIR}/ext/gtest.cmake) include_directories(SYSTEM ${CMAKE_BINARY_DIR}/include ${PROJECT_SOURCE_DIR}/cgranges/cpp + ${gatb_source_dir}/gatb-core/src + ${gatb_binary_dir}/include ) #normal includes: warnings will be reported for these diff --git a/ext/gatb.cmake b/ext/gatb.cmake index f4a60c3f..e0b3f25c 100644 --- a/ext/gatb.cmake +++ b/ext/gatb.cmake @@ -9,7 +9,12 @@ 
ExternalProject_Add(gatb GIT_REPOSITORY https://github.com/GATB/gatb-core.git GIT_TAG "v1.4.1" PREFIX "${CMAKE_CURRENT_BINARY_DIR}/gatb" - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/gatb/src/gatb/gatb-core/gatb-core" + SOURCE_SUBDIR gatb-core CMAKE_ARGS -DKSIZE_LIST=32 INSTALL_COMMAND "") +ExternalProject_Get_Property(gatb source_dir binary_dir) + +set(gatb_source_dir ${source_dir}) +set(gatb_binary_dir ${binary_dir}) + diff --git a/include/denovo_discovery/local_assembly.h b/include/denovo_discovery/local_assembly.h index 83f117ba..3883d8cd 100644 --- a/include/denovo_discovery/local_assembly.h +++ b/include/denovo_discovery/local_assembly.h @@ -8,8 +8,8 @@ #include #include -#include -#include +#include "gatb/debruijn/impl/Simplifications.hpp" +#include "gatb/gatb_core.hpp" #include #include From c1f764d50383e3ee7b4b96b4ae503e323a3ee320 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Wed, 28 Oct 2020 15:13:27 +1000 Subject: [PATCH 5/9] cleanup commented-out code and usage of cerr and cout --- src/de_bruijn/graph.cpp | 163 ++---------------------- src/denovo_discovery/local_assembly.cpp | 3 +- src/estimate_parameters.cpp | 7 +- src/index.cpp | 4 - src/inthash.cpp | 39 ++---- src/kmergraph.cpp | 20 +-- src/kmergraphwithcoverage.cpp | 92 +------------ src/localPRG.cpp | 60 --------- src/localgraph.cpp | 64 +--------- src/localnode.cpp | 13 +- src/minirecord.cpp | 3 - src/noise_filtering.cpp | 129 ++++--------------- src/pangenome/pangraph.cpp | 59 +-------- src/pangenome/panread.cpp | 59 --------- src/prg/path.cpp | 15 +-- src/seq.cpp | 2 - src/utils.cpp | 70 +++------- src/vcf.cpp | 19 --- 18 files changed, 83 insertions(+), 738 deletions(-) diff --git a/src/de_bruijn/graph.cpp b/src/de_bruijn/graph.cpp index 522af13a..9ca2bd2f 100644 --- a/src/de_bruijn/graph.cpp +++ b/src/de_bruijn/graph.cpp @@ -1,9 +1,6 @@ #include -#include -#include #include #include -#include #include #include #include @@ -13,7 +10,6 @@ #include "de_bruijn/graph.h" #include "noise_filtering.h" -#include "utils.h" #define assert_msg(x) !(std::cerr << "Assertion failed: " << x << std::endl) @@ -72,24 +68,11 @@ bool edge_is_valid(OrientedNodePtr from, OrientedNodePtr to) std::deque hashed_node_ids_to = to.first->hashed_node_ids; if (!from.second) { hashed_node_ids_from = rc_hashed_node_ids(hashed_node_ids_from); - // cout << "reverse from" << endl; } if (!to.second) { hashed_node_ids_to = rc_hashed_node_ids(hashed_node_ids_to); - // cout << "reverse to" << endl; - } - /*cout << from.second << to.second << " compare ("; - for (const auto &n : hashed_node_ids_from) - { - cout << n << " "; - } - cout << ") to ("; - for (const auto &n : hashed_node_ids_to) - { - cout << n << " "; } - cout << ")" << endl;*/ return overlap_forwards(hashed_node_ids_from, hashed_node_ids_to); } @@ -102,47 +85,26 @@ void debruijn::Graph::add_edge(OrientedNodePtr from, OrientedNodePtr to) or assert_msg( "edge from " << *from.first << " to " << *to.first << " is invalid")); - // uint8_t num_edges_added = 0; if (from.second and from.first->out_nodes.find(to.first->id) == from.first->out_nodes.end()) { from.first->out_nodes.insert(to.first->id); - // cout << "added edge " << from.first->id << " -> " << to.first->id << " so - // out_nodes.size() == " - // << from.first->out_nodes.size() << endl; - ////num_edges_added += 1; } else if (!from.second and from.first->in_nodes.find(to.first->id) == from.first->in_nodes.end()) { from.first->in_nodes.insert(to.first->id); - // cout << "added edge " << from.first->id << " <- " << to.first->id << " so - // 
in_nodes.size() == " - // << from.first->in_nodes.size() << endl; - ////num_edges_added += 1; } if (to.second and to.first->in_nodes.find(from.first->id) == to.first->in_nodes.end()) { to.first->in_nodes.insert(from.first->id); - // cout << "added edge " << to.first->id << " <- " << from.first->id << " so - // in_nodes.size() == " - // << to.first->in_nodes.size() << endl; - ////num_edges_added += 1; } else if (!to.second and to.first->out_nodes.find(from.first->id) == to.first->out_nodes.end()) { to.first->out_nodes.insert(from.first->id); - // cout << "added edge " << to.first->id << " -> " << from.first->id << " so - // out_nodes.size() == " - // << to.first->out_nodes.size() << endl; - ////num_edges_added += 1; } - - // assert(num_edges_added == 2 or assert_msg("did not add edge from " << *from << " - // to " << *to)); } // Remove all mentions of de bruijn node with id given from graph void debruijn::Graph::remove_node(const uint32_t dbg_node_id) { - // cout << "remove node " << dbg_node_id << endl; auto it = nodes.find(dbg_node_id); if (it != nodes.end()) { // remove this node from lists of out nodes from other graph nodes @@ -160,92 +122,36 @@ void debruijn::Graph::remove_node(const uint32_t dbg_node_id) } } -// remove unitig from dbg -/*void Graph::remove_unitig(const deque& tig) -{ - if (tig.size() < size) - { - return; - } - - // first remove nodes - unordered_set read_ids; - bool orientation = overlap_forwards(nodes[tig[0]]->hashed_node_ids, -nodes[tig[1]]->hashed_node_ids); for (uint i=1; iread_ids.begin(), nodes[tig[i]]->read_ids.end()); - orientation *= overlap_forwards(nodes[tig[i]]->hashed_node_ids, -nodes[tig[i+1]]->hashed_node_ids); remove_node[tig[i]]; - } - - // then add nodes and edges back to allow continuity - NodePtr last_node = nodes[tig[0]]; - NodePtr new_node; - for (uint i=1; i new_tig; - for (uint j=i; jhashed_node_ids[i]); - } - for (uint j=size-i; jhashed_node_ids[i]); - } else { - new_tig.push_back(rc_hashed_node_ids(nodes[tig.back()]->hashed_node_ids)[i]); - } - } - assert(new_tig.size() == size); - for(auto r : read_ids) - new_node = add_node(new_tig, r); - add_edge(last_node, new_node); - last_node = new_node; - } - add_edge(last_node, nodes[tig.back()]); -}*/ - // Remove all copies of read from de bruijn node void debruijn::Graph::remove_read_from_node( const uint32_t read_id, const uint32_t dbg_node_id) { - // cout << "remove read " << (uint)read_id << " from node " << (uint)dbg_node_id << - // endl; auto it = nodes.find(dbg_node_id); bool found_read_intersect; if (it != nodes.end()) { - // cout << "found node " << *(it->second) << endl; auto rit = it->second->read_ids.find(read_id); if (rit != it->second->read_ids.end()) { - // cout << "found read id on node" << endl; it->second->read_ids.erase(rit); - // cout << "removed read id" << endl; // if there are no more reads covering it, remove the node - if (it->second->read_ids.empty()) { - // cout << "no more reads, so remove node" << endl; remove_node(dbg_node_id); } else { // otherwise, remove any outnodes which no longer share a read - // cout << "remove outnodes which no longer share a read" << endl; for (std::unordered_set::iterator nit = it->second->out_nodes.begin(); nit != it->second->out_nodes.end();) { - // cout << "out node " << *nit << endl; found_read_intersect = false; for (const auto& r : it->second->read_ids) { if (nodes[*nit]->read_ids.find(r) != nodes[*nit]->read_ids.end()) { found_read_intersect = true; - // cout << " shares a read" << endl; break; } } if (!found_read_intersect) { - // 
cout << " does not share a read" << endl; nodes[*nit]->in_nodes.erase(dbg_node_id); nit = it->second->out_nodes.erase(nit); - // cout << "removed" << endl; } else { nit++; } @@ -253,21 +159,17 @@ void debruijn::Graph::remove_read_from_node( for (std::unordered_set::iterator nit = it->second->in_nodes.begin(); nit != it->second->in_nodes.end();) { - // cout << "out node " << *nit << endl; found_read_intersect = false; for (const auto& r : it->second->read_ids) { if (nodes[*nit]->read_ids.find(r) != nodes[*nit]->read_ids.end()) { found_read_intersect = true; - // cout << " shares a read" << endl; break; } } if (!found_read_intersect) { - // cout << " does not share a read" << endl; nodes[*nit]->out_nodes.erase(dbg_node_id); nit = it->second->in_nodes.erase(nit); - // cout << "removed" << endl; } else { nit++; } @@ -330,27 +232,18 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and (nodes[tig.back()]->out_nodes.size() + nodes[tig.back()]->in_nodes.size()) == 0); if (tig_is_empty or node_is_isolated) { - // cout << "node is isolated or tig empty" << endl; return; } - bool can_extend = nodes[tig.back()]->out_nodes.size() - == 1; // and nodes[tig.back()]->in_nodes.size() <= 1; + bool can_extend = nodes[tig.back()]->out_nodes.size() == 1; bool use_outnodes = true; while (can_extend) { - /*cout << "tig in progress before b: "; - for (const auto &n : tig) { - cout << n << " "; - } - cout << endl;*/ - - if (use_outnodes) // and find(tig.begin(), tig.end(), - // *nodes[tig.back()]->out_nodes.begin())!=tig.end()) + if (use_outnodes) { tig.push_back(*nodes[tig.back()]->out_nodes.begin()); - else // if (find(tig.begin(), tig.end(), - // *nodes[tig.back()]->in_nodes.begin())!=tig.end()) + } else { tig.push_back(*nodes[tig.back()]->in_nodes.begin()); + } if (std::find(nodes[tig.back()]->in_nodes.begin(), nodes[tig.back()]->in_nodes.end(), *----tig.end()) @@ -359,7 +252,6 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and nodes[tig.back()]->in_nodes.size() <= 1 and tig.front() != tig.back(); use_outnodes = true; - // cout << "A"; } else if (std::find(nodes[tig.back()]->out_nodes.begin(), nodes[tig.back()]->out_nodes.end(), *----tig.end()) != nodes[tig.back()]->out_nodes.end()) { @@ -367,24 +259,15 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and nodes[tig.back()]->out_nodes.size() <= 1 and tig.front() != tig.back(); use_outnodes = false; - // cout << "B"; } else { can_extend = false; - // cout << "C"; } - - /*cout << "tig in progress b: "; - for (const auto &n : tig) { - cout << n << " "; - } - cout << endl;*/ } if (tig.size() == 1) { can_extend = nodes[tig.front()]->in_nodes.size() == 1 and nodes[tig.front()]->out_nodes.size() <= 1; use_outnodes = false; - // cout << "D"; } else { if (std::find(nodes[tig.front()]->in_nodes.begin(), @@ -394,7 +277,6 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and nodes[tig.front()]->in_nodes.size() <= 1 and tig.front() != tig.back(); use_outnodes = true; - // cout << "E"; } else if (std::find(nodes[tig.front()]->out_nodes.begin(), nodes[tig.front()]->out_nodes.end(), *++tig.begin()) != nodes[tig.front()]->out_nodes.end()) { @@ -402,25 +284,17 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and nodes[tig.front()]->out_nodes.size() <= 1 and tig.front() != tig.back(); use_outnodes = false; - // cout << "F"; } else { can_extend = false; } } while (can_extend) { - /*cout << "tig in progress before f: "; - for (const auto &n : tig) { - cout << n << " "; - } - cout << endl;*/ - - if (use_outnodes) // and find(tig.begin(), tig.end(), 
- // *nodes[tig.front()]->out_nodes.begin())!=tig.end()) + if (use_outnodes) { tig.push_front(*nodes[tig.front()]->out_nodes.begin()); - else // if (find(tig.begin(), tig.end(), - // *nodes[tig.front()]->in_nodes.begin())!=tig.end()) + } else { tig.push_front(*nodes[tig.front()]->in_nodes.begin()); + } if (std::find(nodes[tig.front()]->in_nodes.begin(), nodes[tig.front()]->in_nodes.end(), *++tig.begin()) @@ -429,7 +303,6 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and nodes[tig.front()]->in_nodes.size() <= 1 and tig.front() != tig.back(); use_outnodes = true; - // cout << "G"; } else if (std::find(nodes[tig.front()]->out_nodes.begin(), nodes[tig.front()]->out_nodes.end(), *++tig.begin()) != nodes[tig.front()]->out_nodes.end()) { @@ -437,24 +310,14 @@ void debruijn::Graph::extend_unitig(std::deque& tig) and nodes[tig.front()]->out_nodes.size() <= 1 and tig.front() != tig.back(); use_outnodes = false; - // cout << "H"; } else { can_extend = false; - // cout << "I"; - } - - /*cout << "tig in progress f: "; - for (const auto &n : tig) - { - cout << n << " "; } - cout << endl;*/ } - while (tig.size() > 1 - and tig.front() - == tig.back()) // find(tig.begin(), --tig.end(), tig.back()) == tig.end()) + while (tig.size() > 1 and tig.front() == tig.back()) { tig.pop_back(); + } std::stringstream tig_ss; for (const auto& n : tig) @@ -470,8 +333,6 @@ bool debruijn::Graph::found_in_out_nodes( for (const auto& i : node_ptr_to_search->out_nodes) { if (*nodes.at(i) == *node_ptr_to_find) { return true; - } else { - std::cout << *nodes.at(i) << " != " << *node_ptr_to_find << std::endl; } } return false; @@ -484,8 +345,6 @@ bool debruijn::Graph::found_in_in_nodes( for (const auto& i : node_ptr_to_search->in_nodes) { if (*nodes.at(i) == *node_ptr_to_find) { return true; - } else { - std::cout << *nodes.at(i) << " != " << *node_ptr_to_find << std::endl; } } return false; @@ -506,9 +365,6 @@ bool debruijn::Graph::operator==(const Graph& y) const bool found = false; for (const auto& s : y.nodes) { if (*t.second == *s.second) { - // cout << t.first << " " << *t.second << " == " << *s.second << " " << - // s.first << endl; - found = true; // also check the outnodes are the same @@ -545,7 +401,6 @@ bool debruijn::Graph::operator==(const Graph& y) const } // nodes can't be equal within a graph so don't need to check vice versa - // cout << "found all nodes" << endl; return true; } diff --git a/src/denovo_discovery/local_assembly.cpp b/src/denovo_discovery/local_assembly.cpp index 600c7a25..003e8611 100644 --- a/src/denovo_discovery/local_assembly.cpp +++ b/src/denovo_discovery/local_assembly.cpp @@ -180,6 +180,8 @@ std::pair LocalAssemblyGraph::get_paths_between( abandoned = true; break; } + + BOOST_LOG_TRIVIAL(trace) << "Trying local assembly with " << std::to_string(required_percent_of_expected_covg) << " * "; build_paths_between(start_kmer, end_kmer, path_accumulator, tree, node_to_distance_to_the_end_node, paths_between_queries, max_path_length, expected_coverage, required_percent_of_expected_covg); @@ -224,7 +226,6 @@ void LocalAssemblyGraph::build_paths_between(const std::string& start_kmer, return; } } - path_accumulator.push_back(start_kmer.back()); if (string_ends_with(path_accumulator, end_kmer) diff --git a/src/estimate_parameters.cpp b/src/estimate_parameters.cpp index dcd1cf1e..250fb06a 100644 --- a/src/estimate_parameters.cpp +++ b/src/estimate_parameters.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -250,14 +249,14 @@ uint32_t 
estimate_parameters(std::shared_ptr pangraph, // evaluate error rate auto mean = fit_mean_covg(kmer_covg_dist, covg / 10); auto var = fit_variance_covg(kmer_covg_dist, mean, covg / 10); - std::cout << "mean, var: " << mean << " " << var << std::endl; + BOOST_LOG_TRIVIAL(debug) << "mean, var: " << mean << ", " << var; if (mean > var) { auto zero_thresh = 2; mean = fit_mean_covg(kmer_covg_dist, zero_thresh); var = fit_variance_covg(kmer_covg_dist, mean, zero_thresh); - std::cout << "new mean, var: " << mean << " " << var << std::endl; + BOOST_LOG_TRIVIAL(debug) << "new mean, var: " << mean << ", " << var; } - std::cout << bin << " " << num_reads << " " << covg << std::endl; + BOOST_LOG_TRIVIAL(debug) << "nb reads, covg" << num_reads << ", " << covg; if ((bin and num_reads > 30 and covg > 30) or (not bin and abs(var - mean) < 2 and mean > 10 and num_reads > 30 and covg > 2)) { diff --git a/src/index.cpp b/src/index.cpp index bb379457..cb8ae97e 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -22,8 +22,6 @@ void Index::add_record(const uint64_t kmer, const uint32_t prg_id, const prg::Path& path, const uint32_t knode_id, const bool strand) { - // cout << "Add kmer " << kmer << " id, path, strand " << prg_id << ", " << path << - // ", " << strand << endl; auto it = minhash.find(kmer); // checks if kmer is in minhash if (it == minhash.end()) { // no auto* newv = new std::vector; // get a new vector of MiniRecords - @@ -32,7 +30,6 @@ void Index::add_record(const uint64_t kmer, const uint32_t prg_id, newv->emplace_back(MiniRecord(prg_id, path, knode_id, strand)); minhash.insert(std::pair*>( kmer, newv)); // add this record to minhash - // cout << "New minhash size: " << minhash.size() << endl; } else { // yes MiniRecord mr(prg_id, path, knode_id, strand); // create a new MiniRecord from this minimizer kmer @@ -41,7 +38,6 @@ void Index::add_record(const uint64_t kmer, const uint32_t prg_id, // vector of this minimizer it->second->push_back(mr); // no, add it } - // cout << "New minhash entry for kmer " << kmer << endl; } } diff --git a/src/inthash.cpp b/src/inthash.cpp index 07b49f52..b659cc6e 100644 --- a/src/inthash.cpp +++ b/src/inthash.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include "inthash.h" @@ -38,24 +37,16 @@ * means hash_64(x, mask)==hash_64(y, mask) if and only if x==y. 
*/ -unsigned char seq_nt4_table[256] = { - 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; +unsigned char seq_nt4_table[256] = { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, + 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; uint32_t nt4(char c) { return seq_nt4_table[(uint8_t)c]; } @@ -138,27 +129,17 @@ std::pair KmerHash::kmerhash(const std::string& s, const uin assert(s.size() == k); int c; uint64_t shift1 = 2 * (k - 1), mask = (1ULL << 2 * k) - 1, kmer[2] = { 0, 0 }; - // uint64_t mask1 = pow(4,k) - 1, kh = 0, rckh = 0; char myArray[s.size() + 1]; // as 1 char space for null is also required strcpy(myArray, s.c_str()); for (char i : s) { c = seq_nt4_table[(uint8_t)i]; - // cout << s[i] << " -> " << c; if (c < 4) { // not an ambiguous base kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer kmer[1] = (kmer[1] >> 2) | (3ULL ^ c) << shift1; // reverse k-mer - // kh += c*pow(4,i); - // rckh += (3-c)*pow(4,k-i-1); } - // cout << "after adding " << s[i] << "=" << c << " kh is now " << kh << " and - // kmer[0] is now " << kmer[0] << ", rckh is now " << rckh << " and kmer[1] is - // now " << kmer[1] << endl; } - // cout << "s: " << s << " -> " << kh << endl; kmer[0] = hash64(kmer[0], mask); kmer[1] = hash64(kmer[1], mask); - // cout << "kmer " << s << " has kmer[0] " << kmer[0] << " and kmer[1] " << kmer[1] - // << endl; auto ret = std::make_pair(kmer[0], kmer[1]); lookup[s] = ret; diff --git a/src/kmergraph.cpp b/src/kmergraph.cpp index e17dd125..bcca5b68 100644 --- a/src/kmergraph.cpp +++ b/src/kmergraph.cpp @@ -1,17 +1,13 @@ #include -#include #include #include -#include /* NULL */ #include /* srand, rand */ -#include #include #include "utils.h" #include "kmernode.h" #include "kmergraph.h" -#include "kmergraphwithcoverage.h" #include "localPRG.h" #define assert_msg(x) !(std::cerr << "Assertion failed: " << x << std::endl) @@ -109,7 +105,6 @@ KmerNodePtr KmerGraph::add_node(const prg::Path& p) KmerNodePtr n(std::make_shared(nodes.size(), p)); // create the node nodes.push_back(n); // add it to nodes sorted_nodes.insert(n); - // nodes[next_id] = make_shared(next_id, p); 
assert(k == 0 or p.length() == 0 or p.length() == k); if (k == 0 and p.length() > 0) { k = p.length(); @@ -150,7 +145,6 @@ void KmerGraph::add_edge(KmerNodePtr from, KmerNodePtr to) if (from->find_node_ptr_in_out_nodes(to) == from->out_nodes.end()) { from->out_nodes.emplace_back(to); to->in_nodes.emplace_back(from); - // cout << "added edge from " << from->id << " to " << to->id << endl; } } @@ -163,7 +157,6 @@ void KmerGraph::remove_shortcut_edges() std::deque> d; for (const auto& n : nodes) { - // cout << n.first << endl; for (const auto& out : n->out_nodes) { auto out_node_as_shared_ptr = out.lock(); for (auto nextOut = out_node_as_shared_ptr->out_nodes.begin(); @@ -262,8 +255,7 @@ void KmerGraph::save( // TODO: leave as coverage 0 or change this? handle << "\tFC:i:" << 0 << "\t" - << "\tRC:i:" << 0 - << std::endl; //"\t" << (unsigned)nodes[i].second->num_AT << endl; + << "\tRC:i:" << 0 << std::endl; for (uint32_t j = 0; j < c->out_nodes.size(); ++j) { handle << "L\t" << c->id << "\t+\t" << c->out_nodes[j].lock()->id @@ -272,7 +264,7 @@ void KmerGraph::save( } handle.close(); } else { - std::cerr << "Unable to open kmergraph file " << filepath << std::endl; + BOOST_LOG_TRIVIAL(error) << "Unable to open kmergraph file " << filepath; std::exit(EXIT_FAILURE); } } @@ -379,12 +371,10 @@ void KmerGraph::load(const std::string& filepath) to = std::stoi(split_line[1]); } add_edge(nodes[from], nodes[to]); - // nodes[from]->outNodes.push_back(nodes.at(to)); - // nodes[to]->inNodes.push_back(nodes.at(from)); } } } else { - std::cerr << "Unable to open kmergraph file " << filepath << std::endl; + BOOST_LOG_TRIVIAL(error) << "Unable to open kmergraph file " << filepath; exit(1); } } @@ -421,8 +411,7 @@ uint32_t KmerGraph::min_path_length() bool KmerGraph::operator==(const KmerGraph& other_graph) const { // false if have different numbers of nodes - if (other_graph.nodes.size() - != nodes.size()) { // cout << "different numbers of nodes" << endl; + if (other_graph.nodes.size() != nodes.size()) { return false; } @@ -450,7 +439,6 @@ bool KmerGraph::operator==(const KmerGraph& other_graph) const } } } - // otherwise is true return true; } diff --git a/src/kmergraphwithcoverage.cpp b/src/kmergraphwithcoverage.cpp index b77abc31..ae4bc90d 100644 --- a/src/kmergraphwithcoverage.cpp +++ b/src/kmergraphwithcoverage.cpp @@ -1,8 +1,6 @@ #include -#include #include #include -#include /* NULL */ #include /* srand, rand */ #include #include @@ -31,7 +29,6 @@ void KmerGraphWithCoverage::set_binomial_parameter_p(const float e_rate) assert(kmer_prg->k != 0); assert(0 < e_rate and e_rate < 1); binomial_parameter_p = 1 / exp(e_rate * kmer_prg->k); - // cout << "using p: " << p << endl; } void KmerGraphWithCoverage::increment_covg( @@ -82,7 +79,6 @@ void KmerGraphWithCoverage::set_negative_binomial_parameters( { if (nbin_prob == 0 and nb_fail == 0) return; - // qcout << "set nb" << endl; assert((negative_binomial_parameter_p > 0 and negative_binomial_parameter_p < 1) || assert_msg( "nb_p " << negative_binomial_parameter_p << " was not set in kmergraph")); @@ -279,47 +275,6 @@ float KmerGraphWithCoverage::find_max_path(std::vector& maxpath, return prob_path(maxpath, sample_id, prob_model); } -/* -std::vector> KmerGraph::find_max_paths(uint32_t num, - const uint32_t -&sample_id) { - - // save original coverges so can put back at the end - std::vector original_covgs0, original_covgs1; - for (uint32_t i = 0; i != nodes.size(); ++i) { - original_covgs0.push_back(nodes[i]->get_covg(0, sample_id)); - 
original_covgs1.push_back(nodes[i]->get_covg(1, sample_id)); - } - - // find num max paths - //cout << "expected covg " << (uint)(p*num_reads/num) << endl; - std::vector> paths; - std::vector maxpath; - find_max_path(maxpath, <#initializer#>); - //uint min_covg; - paths.push_back(maxpath); - - while (paths.size() < num) { - for (uint32_t i = 0; i != maxpath.size(); ++i) { - maxpath[i]->covg[0] -= std::min(maxpath[i]->covg[0], (uint32_t) (p * -num_reads / num)); maxpath[i]->covg[1] -= std::min(maxpath[i]->covg[1], (uint32_t) (p * -num_reads / num)); - } - maxpath.clear(); - find_max_path(maxpath, <#initializer#>); - paths.push_back(maxpath); - } - - // put covgs back - for (uint32_t i = 0; i != nodes.size(); ++i) { - nodes[i]->covg[0] = original_covgs0[i]; - nodes[i]->covg[1] = original_covgs1[i]; - } - - return paths; -} - */ - std::vector> KmerGraphWithCoverage::get_random_paths( uint32_t num_paths) { @@ -373,44 +328,6 @@ float KmerGraphWithCoverage::prob_path(const std::vector& kpath, return return_prob_path / len; } -/* -float KmerGraph::prob_paths(const std::vector> &kpaths) { - if (kpaths.empty()) { - return 0; // is this the correct default? - } - - // collect how much coverage we expect on each node from these paths - std::vector path_node_covg(nodes.size(), 0); - for (uint32_t i = 0; i != kpaths.size(); ++i) { - for (uint32_t j = 0; j != kpaths[i].size(); ++j) { - path_node_covg[kpaths[i][j]->id] += 1; - } - } - - // now calculate max likelihood assuming independent paths - float ret_p = 0; - uint32_t len = 0; - for (uint32_t i = 0; i != path_node_covg.size(); ++i) { - if (path_node_covg[i] > 0) { - //cout << "prob of node " << nodes[i]->id << " which has path covg " << -path_node_covg[i] << " and so we expect to see " << -num_reads*path_node_covg[i]/kpaths.size() << " times IS " << prob(nodes[i]->id, -num_reads*path_node_covg[i]/kpaths.size()) << endl; ret_p += prob(nodes[i]->id, -num_reads * path_node_covg[i] / kpaths.size(), <#initializer#>); if -(nodes[i]->path.length() > 0) { len += 1; - } - } - } - - if (len == 0) { - len = 1; - } - - //cout << "len " << len << endl; - return ret_p / len; -} -*/ - void KmerGraphWithCoverage::save_covg_dist(const std::string& filepath) { std::ofstream handle; @@ -452,8 +369,7 @@ void KmerGraphWithCoverage::save( } handle << "\tFC:i:" << get_covg(c->id, 0, sample_id) << "\t" - << "\tRC:i:" << get_covg(c->id, 1, sample_id) - << std::endl; //"\t" << (unsigned)nodes[i].second->num_AT << endl; + << "\tRC:i:" << get_covg(c->id, 1, sample_id) << std::endl; for (uint32_t j = 0; j < c->out_nodes.size(); ++j) { handle << "L\t" << c->id << "\t+\t" << c->out_nodes[j].lock()->id @@ -462,7 +378,7 @@ void KmerGraphWithCoverage::save( } handle.close(); } else { - std::cerr << "Unable to open kmergraph file " << filepath << std::endl; + BOOST_LOG_TRIVIAL(error) << "Unable to open kmergraph file " << filepath; std::exit(EXIT_FAILURE); } } @@ -578,12 +494,10 @@ void KmerGraphWithCoverage::load(const std::string& filepath) to = std::stoi(split_line[1]); } kmer_prg->add_edge(kmer_prg->nodes[from], kmer_prg->nodes[to]); - // nodes[from]->out_nodes.push_back(nodes.at(to)); - // nodes[to]->in_nodes.push_back(nodes.at(from)); } } } else { - std::cerr << "Unable to open kmergraph file " << filepath << std::endl; + BOOST_LOG_TRIVIAL(error) << "Unable to open kmergraph file " << filepath; exit(1); } } \ No newline at end of file diff --git a/src/localPRG.cpp b/src/localPRG.cpp index 78c4c7f1..12f58c6e 100644 --- a/src/localPRG.cpp +++ b/src/localPRG.cpp @@ -1,9 +1,7 @@ 
#include #include #include -#include #include -#include #include #include @@ -47,7 +45,6 @@ bool LocalPRG::isalpha_string(const std::string& s) const // Returns if a string s is entirely alphabetic for (char j : s) if (isalpha(j) == 0) { - // std::cout << "Found non-alpha char: " << s[j] << std::endl; return false; } return true; @@ -55,15 +52,12 @@ bool LocalPRG::isalpha_string(const std::string& s) const std::string LocalPRG::string_along_path(const prg::Path& p) const { - // std::cout << p << std::endl; assert(p.get_start() <= seq.length()); assert(p.get_end() <= seq.length()); std::string s; for (const auto& it : p) { s += seq.substr(it.start, it.length); - // std::cout << s << std::endl; } - // std::cout << "lengths: " << s.length() << ", " << p.length << std::endl; assert(s.length() == p.length() || assert_msg("sequence length " << s.length() << " is not equal to path length " << p.length())); @@ -174,9 +168,6 @@ std::vector LocalPRG::nodes_along_path_core(const prg::Path& p) co std::vector LocalPRG::split_by_site(const Interval& i) const { // Splits interval by next_site based on substring of seq in the interval - // std::cout << "splitting by site " << next_site << " in interval " << i << - // std::endl; - // Split first by var site std::vector v; // contains the intervals split by the variant site v.reserve(4); @@ -384,7 +375,6 @@ std::vector LocalPRG::shift(prg::Path p) const std::deque short_paths = { std::make_shared(q) }; std::vector k_paths; bool non_terminus; - // uint exp_num_return_seqs = 0; // first find extensions of the path while (!short_paths.empty()) { @@ -654,7 +644,6 @@ void LocalPRG::minimizer_sketch(const std::shared_ptr& index, const uint3 kmer = string_along_path(*(v[j])); kh = hash.kmerhash(kmer, k); smallest = std::min(smallest, std::min(kh.first, kh.second)); - // std::cout << min(kh.first, kh.second) << " "; } for (uint32_t j = 0; j != w; j++) { kmer = string_along_path(*(v[j])); @@ -850,18 +839,13 @@ std::vector LocalPRG::localnode_path_from_kmernode_path( } } if (localnode_path[0]->id != 0) { - // std::cout << "localnode path still does not start with 0" << std::endl; // add the first path to start LocalNodePtr next = nullptr; while (localnode_path[0]->id != 0 and next != localnode_path[0]) { - // std::cout << "look for a previous node to " << *localnode_path[0] << - // std::endl; next = prg.get_previous_node(localnode_path[0]); if (next != nullptr) { - // std::cout << "add " << *next << std::endl; localnode_path.insert(localnode_path.begin(), next); } - // std::cout << "new start node is " << *localnode_path[0] << std::endl; } } } @@ -899,14 +883,10 @@ std::vector LocalPRG::localnode_path_from_kmernode_path( } } if (localnode_path.back()->id != prg.nodes.size() - 1) { - // std::cout << "localnode path still does not end with " << - // prg.nodes.size()-1 << std::endl; // add the first path to end while (localnode_path.back()->id != prg.nodes.size() - 1 and !localnode_path.back()->outNodes.empty()) { - // std::cout << "extend " << *localnode_path.back(); localnode_path.push_back(localnode_path.back()->outNodes[0]); - // std::cout << " with " << *localnode_path.back() << std::endl; } } } @@ -1188,17 +1168,6 @@ void LocalPRG:: const std::string& sample_name) const { BOOST_LOG_TRIVIAL(debug) << "Update VCF with sample path"; - /*for (uint i=0; i!=sample_path.size(); ++i) - { - std::cout << *sample_path[i] << " "; - } - std::cout << std::endl; - std::cout << now() << "Using ref path" << std::endl; - for (uint i=0; i!=rpath.size(); ++i) - { - std::cout << 
*rpath[i] << " "; - } - std::cout << std::endl;*/ assert(!prg.nodes.empty()); // otherwise empty nodes -> segfault // if prg has only one node, simple case @@ -1231,27 +1200,19 @@ void LocalPRG:: found_new_site = true; sample_id++; } else if (found_new_site) { - // refpath back == samplepath back // add ref allele from previous site to this one - // std::cout << "update with ref alleles from " << pos << " to " << pos_to - // << std::endl; vcf.set_sample_gt_to_ref_allele_for_records_in_the_interval( sample_name, name, pos, pos_to); pos = pos_to; // add new site to vcf - // std::cout << "find ref seq" << std::endl; for (uint32_t j = 1; j < refpath.size() - 1; ++j) { ref += refpath[j]->seq; - // std::cout << ref << std::endl; } - // std::cout << "find alt seq" << std::endl; for (uint32_t j = 1; j < samplepath.size() - 1; ++j) { alt += samplepath[j]->seq; - // std::cout << alt << std::endl; } - // std::cout << "add sample gt" << std::endl; vcf.add_a_new_record_discovered_in_a_sample_and_genotype_it( sample_name, name, pos, ref, alt); found_new_site = false; @@ -1276,7 +1237,6 @@ void LocalPRG:: } pos_to = pos; } else { - // std::cout << pos_to << std::endl; refpath.erase(refpath.begin(), refpath.end() - 1); if (refpath.back()->id != prg.nodes.size() - 1) { ref = ""; @@ -1293,10 +1253,8 @@ void LocalPRG:: } } } - // std::cout << "add last ref alleles" << std::endl; vcf.set_sample_gt_to_ref_allele_for_records_in_the_interval( sample_name, name, pos, pos_to); - // std::cout << "done" << std::endl; } // Find the path through the PRG which deviates at pos from the ref path with alt @@ -1305,8 +1263,6 @@ std::vector LocalPRG::find_alt_path( const std::vector& ref_path, const uint32_t pos, const std::string& ref, const std::string& alt) const { - // std::cout << now() << "Find alt path for PRG " << name << " variant " << pos << " - // " << ref << " " << alt << std::endl; std::vector alt_path, considered_path; std::deque> paths_in_progress; uint32_t ref_added = 0, pos_along_ref_path = 0; @@ -1327,8 +1283,6 @@ std::vector LocalPRG::find_alt_path( break; } } - // std::cout << "pos " << pos << " pos_along_ref_path " << pos_along_ref_path << " - // ref_path.size() " << ref_path.size() << std::endl; // find the localnodeptr we want to make our way back to while (pos_along_ref_path < ref_path.size() - 1 @@ -1339,7 +1293,6 @@ std::vector LocalPRG::find_alt_path( } assert(pos_along_ref_path < ref_path.size()); auto ref_node_to_find = ref_path[pos_along_ref_path]; - // std::cout << "trying to find " << *ref_node_to_find; // find an alt path with the required sequence if (alt_path.empty() and not ref_path.empty() and ref_path[0]->pos.length == 0) @@ -1353,14 +1306,7 @@ std::vector LocalPRG::find_alt_path( considered_path = paths_in_progress.front(); paths_in_progress.pop_front(); - /*std::cout << "consider path "; - for (const auto &t : considered_path){ - std::cout << t->pos << " "; - } - std::cout << std::endl;*/ - auto considered_seq = string_along_path(considered_path); - // std::cout << "considered_seq " << considered_seq << std::endl; if (considered_seq == working_alt) { // check if merge with ref path @@ -1371,11 +1317,6 @@ std::vector LocalPRG::find_alt_path( alt_path.end(), considered_path.begin(), considered_path.end()); alt_path.insert(alt_path.end(), ref_path.begin() + pos_along_ref_path, ref_path.end()); - /*std::cout << "found alt path "; - for (const auto &t : considered_path){ - std::cout << t->pos << " "; - } - std::cout << std::endl;*/ return alt_path; } else { for (const auto& m : 
considered_path.back()->outNodes) { @@ -1544,7 +1485,6 @@ void LocalPRG::add_sample_covgs_to_vcf(VCF& vcf, const KmerGraphWithCoverage& kg for (auto& recordPointer : vcf.get_records()) { auto& record = *recordPointer; - // std::cout << record << std::endl; // find corresponding ref kmers auto end_pos = record.get_ref_end_pos(); diff --git a/src/localgraph.cpp b/src/localgraph.cpp index 5f8e0eee..2ce6de6f 100644 --- a/src/localgraph.cpp +++ b/src/localgraph.cpp @@ -9,19 +9,9 @@ #define assert_msg(x) !(std::cerr << "Assertion failed: " << x << std::endl) -LocalGraph::LocalGraph() -{ - // reserve space in index - // index.reserve(10); -} +LocalGraph::LocalGraph() { } -LocalGraph::~LocalGraph() -{ - /*for (const auto &c: nodes) { - delete c.second; - }*/ - nodes.clear(); -} +LocalGraph::~LocalGraph() { nodes.clear(); } // add the node with a given id and seq if the id is not already in the nodes void LocalGraph::add_node( const uint32_t& id, const std::string& seq, const Interval& pos) @@ -33,9 +23,6 @@ void LocalGraph::add_node( if (it == nodes.end()) { LocalNodePtr n(std::make_shared(seq, pos, id)); nodes[id] = n; // add the node to the map - // nodes[id] = make_shared(seq, pos, id); - // cout << "Added node " << id << endl; - // add the node to the interval indexes for fast overlap queries if (pos.length == 0) startIndexOfZeroLengthIntervals[pos.start] = n; @@ -60,25 +47,9 @@ void LocalGraph::add_edge(const uint32_t& from, const uint32_t& to) << ">" << t->pos.start << " so cannot add edge from node " << *f << " to node " << *t)); f->outNodes.push_back(t); - // cout << "Added edge (" << f->id << ", " << t->id << ")" << endl; } } -/*void LocalGraph::add_varsite (const uint16_t level, const uint32_t pre_site_id, const -uint32_t post_site_id) -{ - assert(pre_site_id <= post_site_id); - while (level >= index.size()) - { - vector> levelv; - levelv.reserve(400); - //levelv = {}; - index.insert(index.end(), 1, levelv); - } - index[level].push_back(make_pair(pre_site_id, post_site_id)); - return; -}*/ - void LocalGraph::write_gfa(const std::string& filepath) const { std::ofstream handle; @@ -140,7 +111,7 @@ void LocalGraph::read_gfa(const std::string& filepath) } } } else { - std::cerr << "Unable to open GFA file " << filepath << std::endl; + BOOST_LOG_TRIVIAL(error) << "Unable to open GFA file " << filepath; std::exit(1); } } @@ -149,8 +120,6 @@ std::vector LocalGraph::walk( const uint32_t& node_id, const uint32_t& pos, const uint32_t& len) const { // node_id: where to start the walk, pos: the position in the node_id, len = k+w-1 -> // the length that the walk has to go through - we are sketching kmers in a graph - // cout << "walking graph from node " << node_id << " pos " << pos << " for length " - // << len << endl; // walks from position pos in node node for length len bases assert( (nodes.at(node_id)->pos.start <= pos && nodes.at(node_id)->pos.get_end() >= pos) @@ -163,35 +132,25 @@ std::vector LocalGraph::walk( prg::Path p, p2; std::deque d; - // cout << "pos+len: " << pos+len << " nodes.at(node_id)->pos.get_end(): " << - // nodes.at(node_id)->pos.get_end() << endl; if (pos + len <= nodes.at(node_id) ->pos.get_end()) { // checks if we can go until the end of the kmer p.initialize(Interval( pos, pos + len)); // create the path containing this interval of the node - // cout << "return path: " << p << endl; return_paths.push_back(std::make_shared(p)); - // cout << "return_paths size: " << return_paths.size() << endl; return return_paths; } uint32_t len_added = 
std::min(nodes.at(node_id)->pos.get_end() - pos, len); - // cout << "len: " << len << " len_added: " << len_added << endl; if (len_added < len) { for (auto it = nodes.at(node_id)->outNodes.begin(); it != nodes.at(node_id)->outNodes.end(); ++it) { - // cout << "Following node: " << (*it)->id << " to add " << len-len_added << - // " more bases" << endl; walk_paths = walk((*it)->id, (*it)->pos.start, len - len_added); - // cout << "walk paths size: " << walk_paths.size() << endl; for (auto& walk_path : walk_paths) { // Note, would have just added start interval to each item in // walk_paths, but can't seem to force result of it2 to be non-const - // cout << (*it2) << endl; p2.initialize(Interval(pos, nodes.at(node_id)->pos.get_end())); p2.insert_to_the_end(walk_path->begin(), walk_path->end()); - // cout << "path: " << p2 << " p2.length: " << p2.length << endl; if (p2.length() == len) { return_paths.push_back(std::make_shared(p2)); } @@ -204,8 +163,6 @@ std::vector LocalGraph::walk( std::vector LocalGraph::walk_back( const uint32_t& node_id, const uint32_t& pos, const uint32_t& len) const { - // cout << "start walking back from " << pos << " in node " << node_id << " for - // length " << len << endl; // walks from position pos in node back through prg for length len bases assert( (nodes.at(node_id)->pos.start <= pos && nodes.at(node_id)->pos.get_end() >= pos) @@ -220,13 +177,11 @@ std::vector LocalGraph::walk_back( if (nodes.at(node_id)->pos.start + len <= pos) { p.initialize(Interval(pos - len, pos)); - // cout << "return path: " << p << endl; return_paths.push_back(std::make_shared(p)); return return_paths; } uint32_t len_added = std::min(pos - nodes.at(node_id)->pos.start, len); - // cout << "len: " << len << " len_added: " << len_added << endl; std::vector::iterator innode; if (len_added < len) { @@ -239,9 +194,7 @@ std::vector LocalGraph::walk_back( for (uint32_t i = 0; i != walk_paths.size(); ++i) { p2.initialize(*(walk_paths[i])); p2.add_end_interval(Interval(nodes.at(node_id)->pos.start, pos)); - // cout << p2 << endl; if (p2.length() == len) { - // cout << "output path: " << p2 << endl; return_paths.push_back(std::make_shared(p2)); } } @@ -306,9 +259,6 @@ std::vector LocalGraph::nodes_along_string( for (uint32_t j = 0; j != p.back()->outNodes.size(); ++j) { // if the start of query_string matches extended candidate_string, want // to query candidate path extensions - // if ( - // query_string.substr(0,candidate_string.size()+u[i].back()->outNodes[j]->seq.size()) - // == candidate_string+u[i].back()->outNodes[j]->seq) auto comp_string = candidate_string + p.back()->outNodes[j]->seq; auto comp_length = std::min(query_string.size(), comp_string.size()); if (strcasecmp(query_string.substr(0, comp_length).c_str(), @@ -406,8 +356,7 @@ std::vector LocalGraph::bottom_path() const bool LocalGraph::operator==(const LocalGraph& y) const { // false if have different numbers of nodes - if (y.nodes.size() - != nodes.size()) { // cout << "different numbers of nodes" << endl; + if (y.nodes.size() != nodes.size()) { return false; } @@ -415,17 +364,14 @@ bool LocalGraph::operator==(const LocalGraph& y) const for (const auto& c : nodes) { // if node id doesn't exist auto it = y.nodes.find(c.first); - if (it == y.nodes.end()) { // cout << "node id doesn't exist" << endl; + if (it == y.nodes.end()) { return false; } // or node entries are different if (!(*c.second == *(it->second))) { - // cout << "node id " << c.first << " exists but has different values" << - // endl; return false; } } - // otherwise 
is true return true; } diff --git a/src/localnode.cpp b/src/localnode.cpp index fc228f1d..b153d3e0 100644 --- a/src/localnode.cpp +++ b/src/localnode.cpp @@ -22,23 +22,18 @@ std::ostream& operator<<(std::ostream& out, LocalNode const& n) bool LocalNode::operator==(const LocalNode& y) const { - if (seq != y.seq) { // cout << "different seq" << endl; + if (seq != y.seq) { return false; } - // if (!(pos == y.pos)) {//cout << "different interval" << endl; - // return false;} - if (id != y.id) { // cout << "different id" << endl; + if (id != y.id) { return false; } - if (outNodes.size() - != y.outNodes.size()) { // cout << "differnet numbers of out edges" << endl; + if (outNodes.size() != y.outNodes.size()) { return false; } for (uint32_t i = 0; i != outNodes.size(); ++i) { spointer_values_equal eq = { outNodes[i] }; - if (find_if(y.outNodes.begin(), y.outNodes.end(), eq) - == y.outNodes.end()) { // cout << "the out edge points to a different node" - // << endl; + if (find_if(y.outNodes.begin(), y.outNodes.end(), eq) == y.outNodes.end()) { return false; } } diff --git a/src/minirecord.cpp b/src/minirecord.cpp index 8e0b84d0..a84b7b10 100644 --- a/src/minirecord.cpp +++ b/src/minirecord.cpp @@ -1,4 +1,3 @@ -#include #include #include #include "minirecord.h" @@ -16,8 +15,6 @@ MiniRecord::~MiniRecord() {}; bool MiniRecord::operator==(const MiniRecord& y) const { - // cout << prg_id << "," << y.prg_id << endl; - // cout << path << "," << y.path << endl; if (prg_id != y.prg_id) { return false; } diff --git a/src/noise_filtering.cpp b/src/noise_filtering.cpp index 4249d301..c22f76dc 100644 --- a/src/noise_filtering.cpp +++ b/src/noise_filtering.cpp @@ -4,12 +4,10 @@ #include #include #include -#include #include #include "utils.h" #include "pangenome/pangraph.h" #include "pangenome/pannode.h" -#include "pangenome/panread.h" #include "de_bruijn/graph.h" #include "minihit.h" @@ -88,20 +86,10 @@ bool overlap_backwards( std::deque rc_hashed_node_ids( const std::deque& hashed_node_ids) { - /*for (const auto &n : hashed_node_ids) - { - cout << n << " "; - }*/ std::deque d; for (const auto& i : hashed_node_ids) { d.push_front(rc_num(i)); } - /*cout << " is now "; - for (const auto &n : d) - { - cout << n << " "; - } - cout << endl;*/ return d; } @@ -168,17 +156,8 @@ void dbg_node_ids_to_ids_and_orientations(const debruijn::Graph& dbg, std::deque hashed_pg_node_ids = extend_hashed_pg_node_ids_backwards(dbg, dbg_node_ids); - if (hashed_pg_node_ids.empty()) - hashed_pg_node_ids = extend_hashed_pg_node_ids_forwards(dbg, dbg_node_ids); if (hashed_pg_node_ids.empty()) { - for (const auto& n : dbg_node_ids) { - std::cout << "("; - for (const auto& m : dbg.nodes.at(n)->hashed_node_ids) { - std::cout << m << " "; - } - std::cout << ") "; - } - std::cout << std::endl; + hashed_pg_node_ids = extend_hashed_pg_node_ids_forwards(dbg, dbg_node_ids); } assert(!hashed_pg_node_ids.empty()); hashed_node_ids_to_ids_and_orientations(hashed_pg_node_ids, node_ids, node_orients); @@ -189,7 +168,6 @@ void construct_debruijn_graph( { dbg.nodes.clear(); dbg.node_hash.clear(); - // cout << "Removed old nodes" << endl; debruijn::OrientedNodePtr prev, current; std::deque hashed_ids; @@ -224,10 +202,10 @@ void construct_debruijn_graph( void remove_leaves(std::shared_ptr pangraph, debruijn::Graph& dbg, uint_least32_t covg_thresh) { - std::cout << now() << "Remove leaves of debruijn graph from pangraph" << std::endl; - std::cout << "Start with " << pangraph->nodes.size() << " pg.nodes, " - << pangraph->reads.size() << " pg.reads, and " << 
dbg.nodes.size() - << " dbg.nodes" << std::endl; + BOOST_LOG_TRIVIAL(debug) << "Remove leaves of debruijn graph from pangraph"; + BOOST_LOG_TRIVIAL(debug) << "Start with " << pangraph->nodes.size() << " pg.nodes, " + << pangraph->reads.size() << " pg.reads, and " + << dbg.nodes.size() << " dbg.nodes"; bool leaves_exist = true; std::unordered_set leaves; std::vector node_ids; @@ -238,41 +216,25 @@ void remove_leaves(std::shared_ptr pangraph, debruijn::Graph& while (leaves_exist) { leaves = dbg.get_leaves(covg_thresh); - std::cout << "there are " << leaves.size() << " leaves" << std::endl; + BOOST_LOG_TRIVIAL(trace) << "there are " << leaves.size() << " leaves"; if (leaves.empty()) { leaves_exist = false; } - std::cout << "leaves exist is " << leaves_exist << std::endl; + BOOST_LOG_TRIVIAL(trace) << "leaves exist is " << leaves_exist; for (const auto& i : leaves) { - std::cout << std::endl << "looking at leaf " << i << ": "; - for (const auto& j : dbg.nodes[i]->hashed_node_ids) { - std::cout << j << " "; - } - std::cout << std::endl; - // look up the node ids and orientations associated with this node hashed_node_ids_to_ids_and_orientations( dbg.nodes[i]->hashed_node_ids, node_ids, node_orients); - std::cout << "looked up node ids" << std::endl; // remove the last node from corresponding reads assert(not dbg.nodes[i]->read_ids.empty()); for (const auto& r : dbg.nodes[i]->read_ids) { - std::cout << "remove from read " << r << ": "; - for (const auto& n : pangraph->reads[r]->get_nodes()) { - std::cout << n.lock()->node_id << " "; - } - std::cout << std::endl; if (pangraph->reads[r]->get_nodes().size() == dbg.size) { - std::cout << "remove read"; pangraph->remove_read(r); - std::cout << " done" << std::endl; } else { - std::cout << "remove from read "; pos = pangraph->reads[r]->find_position(node_ids, node_orients); - std::cout << " pos " << pos.first << " " << pos.second; assert(pos.first == 0 or pos.first + node_ids.size() == pangraph->reads[r]->get_nodes().size()); @@ -288,11 +250,6 @@ void remove_leaves(std::shared_ptr pangraph, debruijn::Graph& (--pangraph->reads[r]->get_nodes().end())->lock()->node_id); node.lock()->remove_read(pangraph->reads[r]); } - std::cout << "read is now " << r << ": "; - for (const auto& n : pangraph->reads[r]->get_nodes()) { - std::cout << n.lock()->node_id << " "; - } - std::cout << "done" << std::endl; } } auto node_shared_ptr_from_weak_ptr = node.lock(); @@ -303,13 +260,11 @@ void remove_leaves(std::shared_ptr pangraph, debruijn::Graph& // remove dbg node dbg.remove_node(i); - - // cout << "pg is now: " << endl << pg << endl; } } - std::cout << "There are now " << pangraph->nodes.size() << " pg.nodes, " - << pangraph->reads.size() << " pg.reads, and " << dbg.nodes.size() - << " dbg.nodes" << std::endl; + BOOST_LOG_TRIVIAL(debug) << "There are now " << pangraph->nodes.size() + << " pg.nodes, " << pangraph->reads.size() + << " pg.reads, and " << dbg.nodes.size() << " dbg.nodes"; } void find_reads_along_tig(const debruijn::Graph& dbg, @@ -323,16 +278,9 @@ void find_reads_along_tig(const debruijn::Graph& dbg, reads_along_tig.insert(pangraph->reads.at(r)); } } - std::cout << "candidate reads "; - for (const auto& r : reads_along_tig) { - std::cout << r->id << " "; - } - std::cout << std::endl; - // filter out some which don't really overlap the unitig, keeping those // which overlap at least consecutive 2 dbg nodes or only one node all_reads_along_tig = true; - std::cout << "kept reads along tig: "; for (auto r = reads_along_tig.begin(); r != 
reads_along_tig.end();) { if ((*r)->get_nodes().size() > dbg.size and (*r)->find_position(pg_node_ids, pg_node_orients, dbg.size + 1).first @@ -340,11 +288,9 @@ void find_reads_along_tig(const debruijn::Graph& dbg, r = reads_along_tig.erase(r); all_reads_along_tig = false; } else { - std::cout << (*r)->id << " "; ++r; } } - // cout << endl; } void remove_middle_nodes_of_tig_from_read(std::shared_ptr pangenome, @@ -357,7 +303,7 @@ void remove_middle_nodes_of_tig_from_read(std::shared_ptr pang // if pos = 0 (ie read does not include start of tig) std::pair pos = r->find_position(node_ids, node_orients); - std::cout << "found pos " << pos.first << " " << pos.second << std::endl; + BOOST_LOG_TRIVIAL(trace) << "found pos " << pos.first << " " << pos.second; auto start_shift = pos.first; if (pos.first > 0 or pos.second < r->get_nodes().size() - 1 or node_ids.size() == r->get_nodes().size()) @@ -389,7 +335,7 @@ void remove_middle_nodes_of_tig_from_read(std::shared_ptr pang end_shift = sub_pos.second + 1; } - std::cout << "remove nodes " << start_shift << " to " << end_shift << std::endl; + BOOST_LOG_TRIVIAL(trace) << "remove nodes " << start_shift << " to " << end_shift; auto it = r->get_nodes().begin() + start_shift; for (auto shift = start_shift; shift < end_shift; ++shift) { if (it == r->get_nodes().end()) { @@ -408,22 +354,17 @@ void remove_middle_nodes_of_tig_from_read(std::shared_ptr pang void filter_unitigs(std::shared_ptr pangraph, debruijn::Graph& dbg, const uint_least32_t& threshold) { - std::cout << now() << "Filter unitigs using threshold " << threshold << std::endl; + BOOST_LOG_TRIVIAL(debug) << "Filter unitigs using threshold " << threshold; std::vector node_ids; std::vector node_orients; std::unordered_set reads_along_tig; bool all_reads_tig; std::set> unitigs = dbg.get_unitigs(); - std::cout << "have " << unitigs.size() << " tigs" << std::endl; + BOOST_LOG_TRIVIAL(debug) << "have " << unitigs.size() << " tigs"; for (auto d : unitigs) { // look up the node ids and orientations associated with this node dbg_node_ids_to_ids_and_orientations(dbg, d, node_ids, node_orients); - std::cout << "tig: "; - for (const auto& n : node_ids) { - std::cout << n << " "; - } - std::cout << std::endl; // collect the reads covering that tig find_reads_along_tig( @@ -432,31 +373,18 @@ void filter_unitigs(std::shared_ptr pangraph, debruijn::Graph& // now if the number of reads covering tig falls below threshold, remove the // middle nodes of this tig from the reads if (reads_along_tig.size() <= threshold) { - std::cout << "not enough reads, so remove the tig from the reads" - << std::endl; + BOOST_LOG_TRIVIAL(trace) + << "not enough reads, so remove the tig from the reads"; for (const auto& r : reads_along_tig) { - std::cout << "read " << r->id << " was "; - for (const auto& n : r->get_nodes()) { - std::cout << n.lock()->node_id << " "; - } - std::cout << std::endl; remove_middle_nodes_of_tig_from_read( pangraph, dbg, r, node_ids, node_orients); - std::cout << "read " << r->id << " is now "; - for (const auto& n : r->get_nodes()) { - std::cout << n.lock()->node_id << " "; - } - std::cout << std::endl; } // also remove read_ids from each of the corresponding nodes of dbg - // cout << "now remove read from dbg nodes" << endl; for (uint32_t i = 1; i < d.size() - 1; ++i) { for (const auto& r : reads_along_tig) { dbg.remove_read_from_node(r->id, d[i]); } } - } else { - std::cout << "tig had enough reads" << std::endl; } reads_along_tig.clear(); } @@ -465,7 +393,7 @@ void filter_unitigs(std::shared_ptr 
pangraph, debruijn::Graph& void detangle_pangraph_with_debruijn_graph( std::shared_ptr pangraph, debruijn::Graph& dbg) { - std::cout << now() << "Detangle pangraph with debruijn graph" << std::endl; + BOOST_LOG_TRIVIAL(debug) << "Detangle pangraph with debruijn graph"; std::vector node_ids; std::vector node_orients; bool all_reads_tig; @@ -475,13 +403,6 @@ void detangle_pangraph_with_debruijn_graph( for (auto d : unitigs) { // look up the node ids and orientations associated with this node dbg_node_ids_to_ids_and_orientations(dbg, d, node_ids, node_orients); - /*cout << "tig: "; - for (const auto &n : node_ids) - { - cout << n << " "; - } - cout << endl;*/ - // collect the reads covering that tig find_reads_along_tig( dbg, d, pangraph, node_ids, node_orients, reads_along_tig, all_reads_tig); @@ -490,20 +411,14 @@ void detangle_pangraph_with_debruijn_graph( // if we find a read which doesn't lie along whole tig, // split that node by reads and create a new node on the tig if (!all_reads_tig and !reads_along_tig.empty()) { - // cout << "not all reads contain tig" << endl; for (uint32_t i = 0; i < node_ids.size(); ++i) { - // cout << "for node " << pg->nodes[node_ids[i]]->node_id << endl; for (const auto& r : pangraph->nodes[node_ids[i]]->reads) { - // cout << "for read " << r->id << endl; if (reads_along_tig.find(r) == reads_along_tig.end()) { - // cout << "split node" << endl; pangraph->split_node_by_reads( reads_along_tig, node_ids, node_orients, node_ids[i]); - // cout << "done" << endl; break; } } - // cout << pg << endl; } } } @@ -512,21 +427,21 @@ void detangle_pangraph_with_debruijn_graph( void clean_pangraph_with_debruijn_graph(std::shared_ptr pangraph, const uint_least32_t size, const uint_least32_t threshold, const bool illumina) { - std::cout << now() << "Construct de Bruijn Graph from PanGraph with size " - << (uint32_t)size << std::endl; + BOOST_LOG_TRIVIAL(debug) << "Construct de Bruijn Graph from PanGraph with size " + << (uint32_t)size; debruijn::Graph dbg(size); construct_debruijn_graph(pangraph, dbg); if (not illumina) remove_leaves(pangraph, dbg, threshold); filter_unitigs(pangraph, dbg, threshold); - std::cout << "Finished filtering tigs" << std::endl; + BOOST_LOG_TRIVIAL(debug) << "Finished filtering tigs"; // update dbg now that have removed leaves and some inner nodes - std::cout << "Reconstruct dbg" << std::endl; + BOOST_LOG_TRIVIAL(trace) << "Reconstruct dbg"; construct_debruijn_graph(pangraph, dbg); - std::cout << "Now detangle" << std::endl; + BOOST_LOG_TRIVIAL(trace) << "Now detangle"; detangle_pangraph_with_debruijn_graph(pangraph, dbg); } diff --git a/src/pangenome/pangraph.cpp b/src/pangenome/pangraph.cpp index 9f668f83..3cd3bd2e 100644 --- a/src/pangenome/pangraph.cpp +++ b/src/pangenome/pangraph.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -14,8 +13,6 @@ #include "pangenome/pannode.h" #include "pangenome/panread.h" #include "pangenome/pansample.h" -#include "minihit.h" -#include "kmergraphwithcoverage.h" #include "fastaq_handler.h" #define assert_msg(x) !(std::cerr << "Assertion failed: " << x << std::endl) @@ -150,7 +147,6 @@ void pangenome::Graph::add_hits_between_PRG_and_sample( // Remove the node n, and all references to it std::unordered_map::iterator pangenome::Graph::remove_node(NodePtr n) { - // cout << "Remove graph node " << *n << endl; // removes all instances of node n and references to it in reads for (const auto& r : n->reads) { r->remove_all_nodes_with_this_id(n->node_id); @@ -169,7 +165,6 @@ 
std::unordered_map::iterator pangenome::Graph::remove_node(No void pangenome::Graph::remove_read(const uint32_t read_id) { for (const auto& n : reads[read_id]->nodes) { - // cout << "looking at read node " << n->node_id; auto nSharedPtr = n.lock(); nSharedPtr->covg -= 1; nSharedPtr->reads.erase(reads[read_id]); @@ -209,11 +204,8 @@ void pangenome::Graph::remove_low_covg_nodes(const uint32_t& thresh) { BOOST_LOG_TRIVIAL(debug) << "Remove nodes with covg <= " << thresh << std::endl; for (auto it = nodes.begin(); it != nodes.end();) { - // cout << "look at node " << *(it->second) << endl; if (it->second->covg <= thresh) { - // cout << "delete node " << it->second->name; it = remove_node(it->second); - // cout << " so pangraph now has " << nodes.size() << " nodes" << endl; } else { ++it; } @@ -262,10 +254,7 @@ void pangenome::Graph::split_node_by_reads(std::unordered_set& reads_al // replace the node in the read if (it != r->nodes.end()) { - // cout << "replace node " << (*it)->node_id << " in this read" << endl; - // cout << "read was " << *r << endl; r->replace_node_with_iterator(it, n); - // cout << "read is now " << *r << endl; nodes[node_id]->reads.erase(rit); nodes[node_id]->covg -= 1; if (nodes[node_id]->covg == 0) { @@ -273,9 +262,6 @@ void pangenome::Graph::split_node_by_reads(std::unordered_set& reads_al } n->reads.insert(r); n->covg += 1; - //} else { - // cout << "read does not contain this node in context of the tig" << - // endl; } } @@ -288,34 +274,6 @@ void pangenome::Graph::split_node_by_reads(std::unordered_set& reads_al } } -/*Graph::unordered_set find_reads_on_node_path(const vector -node_path_ids, const vector node_path_orients) -{ - unordered_set reads_on_node_path; - - // collect all reads on nodes - for (const auto &i : node_path_ids) - { - for (const auto &r : nodes[n]->reads) - { - reads_on_node_path.insert(r); - } - } - - // remove reads which deviate from path - for (auto rit = reads_on_node_path.begin(); rit != reads_on_node_path.end();) - { - if ((*rit)->find_position(node_path_ids, node_path_orients) == -std::numeric_limits::max()) - { - rit = reads_on_node_path.erase(rit); - } else { - rit++; - } - } - return reads_on_node_path; -}*/ - // For each node in pangraph, make a copy of the kmergraph and use the hits // stored on each read containing the node to add coverage to this graph void pangenome::Graph::add_hits_to_kmergraphs( @@ -467,12 +425,6 @@ bool same_prg_id::operator()(const std::pair& n) const bool pangenome::Graph::operator==(const Graph& y) const { - // false if have different numbers of nodes - /*if (y.nodes.size() != nodes.size()) { - cout << "different num nodes " << nodes.size() << "!=" << y.nodes.size() << - endl; return false; - }*/ - // false if have different nodes for (const auto& c : nodes) { // if node id doesn't exist @@ -538,9 +490,9 @@ void pangenome::Graph::save_mapped_read_strings( // for each node in pangraph, find overlaps and write to a file std::vector> read_overlap_coordinates; for (const auto& node_ptr : nodes) { - std::cout << "Find coordinates for node " << node_ptr.second->name; + BOOST_LOG_TRIVIAL(debug) + << "Find coordinates for node " << node_ptr.second->name; node_ptr.second->get_read_overlap_coordinates(read_overlap_coordinates); - std::cout << "." 
<< std::endl; fs::create_directories(outdir + "/" + node_ptr.second->get_name()); outhandle.open(outdir + "/" + node_ptr.second->get_name() + "/" + node_ptr.second->get_name() + ".reads.fa"); @@ -571,13 +523,12 @@ void pangenome::Graph::save_mapped_read_strings( std::ostream& pangenome::operator<<(std::ostream& out, const pangenome::Graph& m) { - // cout << "printing pangraph" << endl; for (const auto& n : m.nodes) { - std::cout << n.second->prg_id << std::endl; - std::cout << *n.second << std::endl; + out << n.second->prg_id << std::endl; + out << *n.second << std::endl; } for (const auto& n : m.reads) { - std::cout << *(n.second) << std::endl; + out << *(n.second) << std::endl; } return out; diff --git a/src/pangenome/panread.cpp b/src/pangenome/panread.cpp index 4dcd54c1..6a87e50e 100644 --- a/src/pangenome/panread.cpp +++ b/src/pangenome/panread.cpp @@ -1,7 +1,5 @@ #include -#include #include -#include #include #include #include @@ -110,42 +108,21 @@ std::pair Read::find_position( const std::vector& node_ids, const std::vector& node_orients, const uint16_t min_overlap) { - /*cout << "searching for "; - for (const auto &n : node_ids) - { - cout << n << " "; - } - cout << " in "; - for (const auto &n : nodes) - { - cout << n->node_id << " "; - } - cout << endl;*/ - assert(node_ids.size() == node_orients.size()); assert(not node_ids.empty()); uint32_t search_pos = 0; uint32_t found_pos = 0; - // cout << "searching forwards for " << node_ids[0] << " " << node_orients[0]; - // cout << " and backwards for " << node_ids.back() << " " << !node_orients.back() - // << endl; - for (uint32_t i = 0; i < nodes.size(); ++i) { - // cout << "compare nodes pos " << i << " with node_ids 0" << endl; // if first node matches at position i going forwards... if (nodes[i].lock()->node_id == node_ids[0] and node_orientations[i] == node_orients[0]) { - // cout << "start node " << i << " fwd " << nodes[i]->node_id << " " << - // node_orientations[i]; cout << " matches " << node_ids[0] << " and " << - // node_orients[0] << endl; search_pos = 0; found_pos = 0; while (i + found_pos < nodes.size() and nodes[i + found_pos].lock()->node_id == node_ids[search_pos] and node_orientations[i + found_pos] == node_orients[search_pos]) { - // cout << "fwd " << search_pos << " " << i + found_pos << endl; if (search_pos == node_ids.size() - 1 or i + found_pos == nodes.size() - 1) { if (found_pos + 1 >= min_overlap) { @@ -157,30 +134,20 @@ std::pair Read::find_position( search_pos++; found_pos++; } - // cout << "end fwd" << endl; } // if i+node_ids.size() is over the end of nodes, consider partial matches which // skip the first nodes - // cout << "compare nodes pos " << 0 << " with node_ids " << i + node_ids.size() - // - nodes.size() << endl; if (i + node_ids.size() > nodes.size() and nodes[0].lock()->node_id == node_ids[i + node_ids.size() - nodes.size()] and node_orientations[0] == node_orients[i + node_orients.size() - nodes.size()]) { - // cout << "start node " << i << " truncated fwd " << nodes[0]->node_id; - // cout << " " << node_orientations[0]; - // cout << " matches " << node_ids[i + node_ids.size() - nodes.size()]; - // cout << " and " << node_orients[i + node_ids.size() - nodes.size()] << - // endl; - search_pos = i + node_ids.size() - nodes.size(); found_pos = 0; while (found_pos < nodes.size() and nodes[found_pos].lock()->node_id == node_ids[search_pos] and node_orientations[found_pos] == node_orients[search_pos]) { - // cout << "fwd " << search_pos << " " << found_pos << endl; if (search_pos == node_ids.size() - 
1 or found_pos == nodes.size() - 1) { if (found_pos + 1 >= min_overlap) { @@ -194,14 +161,9 @@ std::pair Read::find_position( } } - // cout << "compare nodes pos " << nodes.size() -1 -i << " with node_ids " << 0 - // << endl; if (nodes[nodes.size() - 1 - i].lock()->node_id == node_ids[0] and node_orientations[node_orientations.size() - 1 - i] == !node_orients[0]) { - // cout << "start node " << i << " bwd " << nodes[nodes.size() -1 - // -i]->node_id; cout << " " << node_orientations[nodes.size() -1 -i]; cout - // << " matches " << node_ids[0] << " and " << !node_orients[0] << endl; search_pos = 0; found_pos = 0; @@ -210,8 +172,6 @@ std::pair Read::find_position( == node_ids[search_pos] and node_orientations[nodes.size() - 1 - i - found_pos] == !node_orients[search_pos]) { - // cout << "bwd " << search_pos << " " << nodes.size() -1 -i -found_pos - // << endl; if (search_pos == node_ids.size() - 1 or i + 1 + found_pos == nodes.size()) { if (found_pos + 1 >= min_overlap) { @@ -224,29 +184,15 @@ std::pair Read::find_position( search_pos++; found_pos++; } - // cout << "end bwd" << endl; } // if we are considering matches which overlap the start backwards, also // consider ones which overlap the end backwards by the same amount - // cout << "compare nodes pos " << nodes.size() -1 << " with node_ids " << - // nodes.size() -1 - i << endl; if (i + node_ids.size() > nodes.size() - // and nodes[nodes.size() -1 -i]->node_id == node_ids[i + node_ids.size() - - // nodes.size()] and node_orientations[node_orientations.size() -1 -i] == - // !node_orients[i + node_orients.size() - nodes.size()]) and nodes.back().lock()->node_id == node_ids[i + node_ids.size() - nodes.size()] and node_orientations.back() == !node_orients[i + node_orients.size() - nodes.size()]) { - // cout << "start node " << i << " truncated bwd " << nodes.back()->node_id; - // cout << " " << node_orientations.back(); - // //cout << " matches " << node_ids[node_ids.size() - nodes.size()-1 - i]; - // //cout << " and " << !node_orients[node_ids.size() - nodes.size()-1 - i] - // << endl; - // cout << " matches " << node_ids[i + node_ids.size() - nodes.size()]; - // cout << " and " << !node_orients[i + node_ids.size() - nodes.size()] << - // endl; search_pos = i + node_ids.size() - nodes.size(); found_pos = 0; @@ -255,7 +201,6 @@ std::pair Read::find_position( == node_ids[search_pos] and node_orientations[nodes.size() - 1 - found_pos] == !node_orients[search_pos]) { - // cout << "bwd " << search_pos << " " << found_pos << endl; if (search_pos == node_ids.size() - 1 or i + 1 + found_pos == nodes.size()) { if (found_pos + 1 >= min_overlap) { @@ -289,7 +234,6 @@ void Read::remove_all_nodes_with_this_id(const uint32_t node_id) std::vector::iterator Read::remove_node_with_iterator( std::vector::iterator nit) { - //(*nit)->covg -= 1; uint32_t d = distance(nodes.begin(), nit); node_orientations.erase(node_orientations.begin() + d); nit = nodes.erase(nit); @@ -299,11 +243,8 @@ std::vector::iterator Read::remove_node_with_iterator( void Read::replace_node_with_iterator( std::vector::iterator n_original, NodePtr n) { - // hits[n->node_id].insert(hits[(*n_original)->node_id].begin(),hits[(*n_original)->node_id].end() - // ); auto it = nodes.erase(n_original); nodes.insert(it, n); - // hits.erase((*n_original)->node_id); } bool Read::operator==(const Read& y) const diff --git a/src/prg/path.cpp b/src/prg/path.cpp index 4d39988a..39f82bb1 100644 --- a/src/prg/path.cpp +++ b/src/prg/path.cpp @@ -69,8 +69,7 @@ std::vector prg::Path::nodes_along_path(const 
LocalPRG& localPrg) prg::Path prg::Path::subpath(const uint32_t start, const uint32_t len) const { // function now returns the path starting at position start along the path, rather - // than at position start on linear PRG, and for length len cout << "find subpath of - // " << *this << " from " << start << " for length " << len << endl; + // than at position start on linear PRG, and for length len assert(start + len <= length()); prg::Path p; std::deque d; @@ -87,17 +86,12 @@ prg::Path prg::Path::subpath(const uint32_t start, const uint32_t len) const p.initialize(d); added_len += std::min(len - added_len, interval.length - start + covered_length); - /// cout << "added first interval " << p.path.back() << " and added length - /// is now " << added_len << endl; } else if (covered_length >= start and added_len <= len) { p.add_end_interval(Interval(interval.start, std::min(interval.get_end(), interval.start + len - added_len))); added_len += std::min(len - added_len, interval.length); - // cout << "added interval " << p.path.back() << " and added length is now " - // << added_len << endl; } covered_length += interval.length; - // cout << "covered length is now " << covered_length << endl; if (added_len >= len) { break; } @@ -121,7 +115,6 @@ bool prg::Path::is_branching(const prg::Path& y) if (overlap) { if (it->start != it2->start) { // had paths which overlapped and now don't - // cout << "branch" << endl; return true; // represent branching paths at this point } ++it2; @@ -131,17 +124,14 @@ bool prg::Path::is_branching(const prg::Path& y) } } else { for (it2 = y.path.begin(); it2 != y.path.end(); ++it2) { - // cout << *it << " " << *it2 << endl; if ((it->get_end() > it2->start and it->start < it2->get_end()) or (*it == *it2)) { // then the paths overlap overlap = true; - // cout << "overlap" << endl; // first query the previous intervals if they exist, to see if they // overlap if (it != path.begin() && it2 != y.path.begin() && (--it)->get_end() != (--it2)->get_end()) { - // cout << "coal" << endl; return true; // represent coalescent paths at this point } else { ++it2; @@ -163,9 +153,6 @@ bool prg::Path::is_subpath(const prg::Path& big_path) const { if (big_path.length() < length() or big_path.get_start() > get_start() or big_path.get_end() < get_end() or is_branching(big_path)) { - // cout << "fell at first hurdle " << (big_path.length() < length()); - // cout << (big_path.start > start) << (big_path.end < end); - // cout << (is_branching(big_path)) << endl; return false; } diff --git a/src/seq.cpp b/src/seq.cpp index 5331ee4b..646a24e1 100644 --- a/src/seq.cpp +++ b/src/seq.cpp @@ -75,7 +75,6 @@ void Seq::add_minimizing_kmers_to_sketch( for (const auto& minimizer : window) { if (minimizer.canonical_kmer_hash == smallest) { sketch.insert(minimizer); - // num_minis_found += 1; } } } @@ -138,7 +137,6 @@ void Seq::minimizer_sketch(const uint32_t w, const uint32_t k) "still has size " << window.size())); } - // cout << now() << "Sketch size " << sketch.size() << " for read " << name << endl; } std::ostream& operator<<(std::ostream& out, Seq const& data) diff --git a/src/utils.cpp b/src/utils.cpp index 206cf71c..4c12119d 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -1,13 +1,10 @@ #include -#include #include #include #include #include -#include #include #include -#include #include #include #include @@ -18,7 +15,6 @@ #include "seq.h" #include "localPRG.h" #include "pangenome/pangraph.h" -#include "pangenome/panread.h" #include "noise_filtering.h" #include "minihit.h" #include 
"fastaq_handler.h" @@ -76,7 +72,6 @@ char complement(char n) case 'c': return 'G'; } - // assert(false); return 'N'; } @@ -134,7 +129,7 @@ void read_prg_file(std::vector>& prgs, prgs.push_back(s); id++; } else { - std::cerr << "Failed to make LocalPRG for " << fh.name << std::endl; + BOOST_LOG_TRIVIAL(error) << "Failed to make LocalPRG for " << fh.name; exit(1); } } @@ -151,12 +146,10 @@ void load_PRG_kmergraphs(std::vector>& prgs, prefix += prgfile.substr(0, pos); prefix += "/"; } - // cout << "prefix for kmerprgs dir is " << prefix << endl; auto dir_num = 0; std::string dir; for (const auto& prg : prgs) { - // cout << "Load kmergraph for " << prg->name << endl; if (prg->id % 4000 == 0) { dir = prefix + "kmer_prgs/" + int_to_string(dir_num + 1); dir_num++; @@ -189,9 +182,6 @@ void load_vcf_refs_file(const std::string& filepath, VCFRefs& vcf_refs) void add_read_hits(const Seq& sequence, const std::shared_ptr& minimizer_hits, const Index& index) { - // cout << now() << "Search for hits for read " << s->name << " which has sketch - // size " << s->sketch.size() << " against index of size " << idx->minhash.size() << - // endl; uint32_t hit_count = 0; // creates Seq object for the read, then looks up minimizers in the Seq sketch and // adds hits to a global MinimizerHits object // Seq s(id, name, seq, w, k); @@ -202,14 +192,9 @@ void add_read_hits(const Seq& sequence, // yes, add all hits of this minimizer hit to this kmer for (const MiniRecord& miniRecord : *(minhashIt->second)) { minimizer_hits->add_hit(sequence.id, *sequenceSketchIt, miniRecord); - //++hit_count; } } } - // hits->sort(); - // cout << now() << "Found " << hit_count << " hits found for read " << s->name << " - // so size of MinimizerHits is now " - // << hits->hits.size() + hits->uhits.size() << endl; } void define_clusters(std::set& clusters_of_hits, @@ -218,7 +203,7 @@ void define_clusters(std::set& clusters_of_hit const float& fraction_kmers_required_for_cluster, const uint32_t min_cluster_size, const uint32_t expected_number_kmers_in_short_read_sketch) { - BOOST_LOG_TRIVIAL(debug) << "Define clusters of hits from the " + BOOST_LOG_TRIVIAL(trace) << "Define clusters of hits from the " << minimizer_hits->hits.size() << " hits"; if (minimizer_hits->hits.empty()) { @@ -245,7 +230,7 @@ void define_clusters(std::set& clusters_of_hit prgs[(*mh_previous)->get_prg_id()]->kmer_prg.min_path_length(), expected_number_kmers_in_short_read_sketch) * fraction_kmers_required_for_cluster; - BOOST_LOG_TRIVIAL(debug) + BOOST_LOG_TRIVIAL(trace) << "Length based cluster threshold min(" << prgs[(*mh_previous)->get_prg_id()]->kmer_prg.min_path_length() << ", " << expected_number_kmers_in_short_read_sketch << ") * " @@ -255,9 +240,8 @@ void define_clusters(std::set& clusters_of_hit if (current_cluster.size() > std::max(length_based_threshold, min_cluster_size)) { clusters_of_hits.insert(current_cluster); - // cout << "Found cluster of size " << current_cluster.size() << endl; } else { - BOOST_LOG_TRIVIAL(debug) + BOOST_LOG_TRIVIAL(trace) << "Rejected cluster of size " << current_cluster.size() << " < max(" << length_based_threshold << ", " << min_cluster_size << ")"; @@ -271,7 +255,7 @@ void define_clusters(std::set& clusters_of_hit = std::min(prgs[(*mh_previous)->get_prg_id()]->kmer_prg.min_path_length(), expected_number_kmers_in_short_read_sketch) * fraction_kmers_required_for_cluster; - BOOST_LOG_TRIVIAL(debug) + BOOST_LOG_TRIVIAL(trace) << "Length based cluster threshold min(" << prgs[(*mh_previous)->get_prg_id()]->kmer_prg.min_path_length() 
<< ", " << expected_number_kmers_in_short_read_sketch << ") * " @@ -279,39 +263,27 @@ void define_clusters(std::set& clusters_of_hit if (current_cluster.size() > std::max(length_based_threshold, min_cluster_size)) { clusters_of_hits.insert(current_cluster); } else { - BOOST_LOG_TRIVIAL(debug) + BOOST_LOG_TRIVIAL(trace) << "Rejected cluster of size " << current_cluster.size() << " < max(" << length_based_threshold << ", " << min_cluster_size << ")"; } - BOOST_LOG_TRIVIAL(debug) << "Found " << clusters_of_hits.size() + BOOST_LOG_TRIVIAL(trace) << "Found " << clusters_of_hits.size() << " clusters of hits"; } void filter_clusters(std::set& clusters_of_hits) { // Next order clusters, choose between those that overlap by too much - BOOST_LOG_TRIVIAL(debug) << "Filter the " << clusters_of_hits.size() + BOOST_LOG_TRIVIAL(trace) << "Filter the " << clusters_of_hits.size() << " clusters of hits"; if (clusters_of_hits.empty()) { return; } // to do this consider pairs of clusters in turn auto c_previous = clusters_of_hits.begin(); - /*cout << "first cluster" << endl; - for (set::iterator p=c_previous->begin(); - p!=c_previous->end(); ++p) - { - cout << **p << endl; - }*/ for (auto c_current = ++clusters_of_hits.begin(); c_current != clusters_of_hits.end(); ++c_current) { - /*cout << "current cluster" << endl; - for (set::iterator p=c_current->begin(); - p!=c_current->end(); ++p) - { - cout << **p << endl; - }*/ if (((*(*c_current).begin())->get_read_id() == (*(*c_previous).begin())->get_read_id()) && // if on same read and either @@ -330,16 +302,14 @@ void filter_clusters(std::set& clusters_of_hit { if (c_previous->size() >= c_current->size()) { clusters_of_hits.erase(c_current); - // cout << "erase current" << endl; c_current = c_previous; } else { clusters_of_hits.erase(c_previous); - // cout << "erase previous" << endl; } } c_previous = c_current; } - BOOST_LOG_TRIVIAL(debug) << "Now have " << clusters_of_hits.size() + BOOST_LOG_TRIVIAL(trace) << "Now have " << clusters_of_hits.size() << " clusters of hits"; } @@ -348,7 +318,7 @@ void filter_clusters2(std::set& clusters_of_hi { // Sort clusters by size, and filter out those small clusters which are entirely // contained in bigger clusters on reads - BOOST_LOG_TRIVIAL(debug) << "Filter2 the " << clusters_of_hits.size() + BOOST_LOG_TRIVIAL(trace) << "Filter2 the " << clusters_of_hits.size() << " clusters of hits"; if (clusters_of_hits.empty()) { return; @@ -359,25 +329,18 @@ void filter_clusters2(std::set& clusters_of_hi auto it = clusters_by_size.begin(); std::vector read_v(genome_size, 0); - // cout << "fill from " << (*(it->begin()))->read_start_position() << " to " << - // (*--(it->end()))->read_start_position() << endl; fill(read_v.begin() + (*(it->begin()))->get_read_start_position(), read_v.begin() + (*--(it->end()))->get_read_start_position(), 1); bool contained; for (auto it_next = ++clusters_by_size.begin(); it_next != clusters_by_size.end(); ++it_next) { - // cout << "read id " << (*(it_next->begin()))->get_prg_id() << endl; if ((*(it_next->begin()))->get_read_id() == (*(it->begin()))->get_read_id()) { // check if have any 0s in interval of read_v between first and last contained = true; for (uint32_t i = (*(it_next->begin()))->get_read_start_position(); i < (*--(it_next->end()))->get_read_start_position(); ++i) { - // cout << i << ":" << read_v[i] << "\t"; if (read_v[i] == 0) { contained = false; - // cout << "found unique element at read position " << i << endl; - // cout << "fill from " << i << " to " << - // 
(*--(it_next->end()))->get_read_start_position() << endl; fill(read_v.begin() + i, read_v.begin() + (*--(it_next->end()))->get_read_start_position(), @@ -385,20 +348,16 @@ void filter_clusters2(std::set& clusters_of_hi break; } } - // cout << endl; if (contained) { - // cout << "erase cluster so clusters_of_hits has size decrease from " - // << clusters_of_hits.size(); clusters_of_hits.erase(*it_next); - // cout << " to " << clusters_of_hits.size() << endl; } } else { - // cout << "consider new read" << endl; + // consider new read fill(read_v.begin(), read_v.end(), 0); } ++it; } - BOOST_LOG_TRIVIAL(debug) << "Now have " << clusters_of_hits.size() + BOOST_LOG_TRIVIAL(trace) << "Now have " << clusters_of_hits.size() << " clusters of hits"; } @@ -407,7 +366,7 @@ void add_clusters_to_pangraph( std::shared_ptr pangraph, const std::vector>& prgs) { - BOOST_LOG_TRIVIAL(debug) << "Add inferred order to PanGraph"; + BOOST_LOG_TRIVIAL(trace) << "Add inferred order to PanGraph"; if (clusters_of_hits.empty()) { return; } @@ -645,6 +604,7 @@ uint32_t strtogs(const char* str) return (uint32_t)(x + .499); } -std::string transform_cli_gsize(std::string str) { +std::string transform_cli_gsize(std::string str) +{ return int_to_string(strtogs(str.c_str())); } diff --git a/src/vcf.cpp b/src/vcf.cpp index 9b2f0a8a..fdd858f8 100644 --- a/src/vcf.cpp +++ b/src/vcf.cpp @@ -78,7 +78,6 @@ ptrdiff_t VCF::get_sample_index(const std::string& name) // if this sample has not been added before, add a column for it auto sample_it = find(samples.begin(), samples.end(), name); if (sample_it == samples.end()) { - // cout << "this is the first time this sample has been added" << endl; samples.push_back(name); for (auto& record_ptr : records) { record_ptr->add_new_samples(1); @@ -95,14 +94,10 @@ void VCF::add_a_new_record_discovered_in_a_sample_and_genotype_it( const std::string& sample_name, const std::string& chrom, const uint32_t pos, const std::string& ref, const std::string& alt) { - // cout << "adding gt " << chrom << " " << pos << " " << ref << " vs " << - // sample_name << " " << alt << endl; if (ref == "" and alt == "") { return; } - // cout << "adding gt " << ref << " vs " << alt << endl; - ptrdiff_t sample_index = get_sample_index(sample_name); VCFRecord vcf_record(this, chrom, pos, ref, alt); @@ -112,14 +107,11 @@ void VCF::add_a_new_record_discovered_in_a_sample_and_genotype_it( vcf_record); // TODO: improve this search to log(n) using alt map or sth bool vcf_record_was_found = vcf_record_iterator != records.end(); if (vcf_record_was_found) { - // cout << "found record with this ref and alt" << endl; (*vcf_record_iterator) ->sampleIndex_to_sampleInfo[sample_index] .set_gt_from_max_likelihood_path(1); vcf_record_pointer = vcf_record_iterator->get(); } else { - // cout << "didn't find alt record for pos " << pos << " ref " << ref << " and - // alt " << alt << endl; // either we have the ref allele, an alternative allele for alt too nested site, // or alt mistake bool vcf_record_was_processed = false; @@ -137,7 +129,6 @@ void VCF::add_a_new_record_discovered_in_a_sample_and_genotype_it( } } } else { - // cout << "have new allele not in graph" << endl; add_record( chrom, pos, ref, alt, "SVTYPE=COMPLEX", "GRAPHTYPE=TOO_MANY_ALTS"); records.back() @@ -174,8 +165,6 @@ void VCF::update_other_samples_of_this_record(VCFRecord* reference_record) and other_record->sampleIndex_to_sampleInfo[sample_index] .get_gt_from_max_likelihood_path() == 0) { - // cout << "update my record to have ref allele also for sample " << - // 
sample_index << endl; cout << records[record_index] << endl; reference_record->sampleIndex_to_sampleInfo[sample_index] .set_gt_from_max_likelihood_path(0); } @@ -195,7 +184,6 @@ void VCF::set_sample_gt_to_ref_allele_for_records_in_the_interval( if (record->ref_allele_is_inside_given_interval(chrom, pos_from, pos_to)) { record->sampleIndex_to_sampleInfo[sample_index] .set_gt_from_max_likelihood_path(0); - // cout << "update record " << records[i] << endl; } } } @@ -316,13 +304,6 @@ void VCF::merge_multi_allelic_core(VCF& merged_VCF, uint32_t max_allele_length) .empty()) { assert_msg("VCF::merge_multi_allelic: vcf_record_to_be_merged_in " "has no samples"); - /* - records.at(current_vcf_record_position)->clear(); - records.at(previous_vcf_record_position)->clear(); - merged_VCF.add_record_core(vcf_record_merged); - previous_vcf_record_position = records.size() - 1; - vcf_record_merged = *(records.at(previous_vcf_record_position)); - */ } vcf_record_merged->merge_record_into_this( *vcf_record_to_be_merged_in_pointer); From 23328483e1f31488cc71d32c27b16da0bce3d0cd Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Thu, 29 Oct 2020 14:20:39 +1000 Subject: [PATCH 6/9] update readme and type name for discover-k --- README.md | 5 +++-- src/denovo_discovery/discover_main.cpp | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e9115ac8..d0f02230 100644 --- a/README.md +++ b/README.md @@ -261,7 +261,7 @@ This will look for regions in the pangraph where the reads do not map and attemp ``` $ pandora discover --help Quasi-map reads to an indexed PRG, infer the sequence of present loci in the sample and discover novel variants. -Usage: ./pandora discover [OPTIONS] +Usage: pandora discover [OPTIONS] Positionals: FILE [required] An indexed PRG file (in fasta format) @@ -269,7 +269,7 @@ Positionals: Options: -h,--help Print this help message and exit - --discover-k INT K-mer size to use when discovering novel variants [default: 11] + --discover-k INT:[0-32) K-mer size to use when discovering novel variants [default: 11] --max-ins INT Max. insertion size for novel variants. Warning: setting too long may impair performance [default: 15] --covg-threshold INT Positions with coverage less than this will be tagged for variant discovery [default: 3] -l INT Min. length of consecutive positions below coverage threshold to trigger variant discovery [default: 1] @@ -303,6 +303,7 @@ Preset: Filtering: --clean Add a step to clean and detangle the pangraph + --clean-dbg Clean the local assembly de Bruijn graph --max-covg INT Maximum coverage of reads to accept [default: 600] Consensus/Variant Calling: diff --git a/src/denovo_discovery/discover_main.cpp b/src/denovo_discovery/discover_main.cpp index f2c6788a..52976ef6 100644 --- a/src/denovo_discovery/discover_main.cpp +++ b/src/denovo_discovery/discover_main.cpp @@ -112,7 +112,8 @@ void setup_discover_subcommand(CLI::App& app) ->add_option("--discover-k", opt->denovo_kmer_size, "K-mer size to use when discovering novel variants") ->capture_default_str() - ->check(CLI::Range(0, MAX_DENOVO_K)) + ->check(CLI::Range(0, MAX_DENOVO_K) + .description("[0-" + std::to_string(MAX_DENOVO_K) + ")")) ->type_name("INT"); std::string description = "Max. insertion size for novel variants. 
Warning: " From 8988378250f05f8e0edd19a772cf316e0931c3ac Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Thu, 29 Oct 2020 16:17:20 +1000 Subject: [PATCH 7/9] add build and test stage to travis change boost install method remove travis_retry change boost build dir add boost root to cmake WIP: trying to get cmake building WIP: dont rely on previous CMAKE_OPTIONS variable WIP: add missing boost variables to script WIP: fiddling with boost root WIP: comment out cache WIP: try and make boost install simpler WIP: trying to get pandora to include boost WIP: try to install boost in /usr WIP: add sudo WIP: fix ctest verbose flag WIP: try and cache boost WIP: format readme and update travis badges WIP: remove caching of /usr dirs --- .travis.yml | 41 +++++++++----- README.md | 149 +++++++++++++++++++++++++++++++++++++++----------- ci/install.sh | 17 ++++++ ci/script.sh | 12 ++++ 4 files changed, 172 insertions(+), 47 deletions(-) create mode 100644 ci/install.sh create mode 100644 ci/script.sh diff --git a/.travis.yml b/.travis.yml index 20a626ca..00b7e9b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,20 @@ language: cpp compiler: gcc os: linux dist: bionic +env: + - OMP_NUM_THREADS=4 # https://docs.travis-ci.com/user/languages/cpp/#openmp-projects + +git: + depth: 3 # https://docs.travis-ci.com/user/customizing-the-build/#git-clone-depth + +cache: + directories: + - ${TRAVIS_BUILD_DIR}/deps/boost-1.62.0 + +branches: + only: # build all branches https://docs.travis-ci.com/user/customizing-the-build/#safelisting-or-blocklisting-branches + - gh-pages + - /.*/ include: - os: linux @@ -9,28 +23,27 @@ include: apt: packages: - clang-format-8 + - zlib1g-dev jobs: + fast_finish: true include: - - stage: "Lint" # naming the Tests stage - name: "Format" # names the first Tests stage job + - stage: "Format" script: - find src/ include/ test/ -type f \( -iname \*.h -o -iname \*.cpp \) | xargs -I _ clang-format -style=file -output-replacements-xml _ | grep -c "/dev/null - if [ $? 
-ne 1 ]; then echo "Not all source and header files are formatted with clang-format"; exit 1; fi - - stage: "Build" - script: echo "This is where we build Pandora" - - stage: "Test" - name: "Unit tests" - script: echo "This is where we run the unit tests" - - stage: "Deploy" - name: "DockerHub" - script: echo "This is where we deploy a container to Docker Hub" + - stage: "Build and Test" + env: + - BOOST_VERSION="1.62.0" + - BOOST_LIBS="system,filesystem,iostreams,log,thread,date_time" + - BUILD_TYPE="Debug" + - DEPS_DIR="${TRAVIS_BUILD_DIR}/deps" + install: bash ci/install.sh + script: bash ci/script.sh stages: - - "Lint" - - "Build" - - "Test" - - "Deploy" + - "Format" + - "Build and Test" notifications: email: false diff --git a/README.md b/README.md index d0f02230..0722b151 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,89 @@ -[![Build Status](https://travis-ci.org/rmcolq/pandora.svg?branch=master)](https://travis-ci.org/rmcolq/pandora) master +| Branch | Status | +|:-------------------|:---------------------------------------------------------------------------------| +| [`master`][master] | ![Travis (.com) branch](https://img.shields.io/travis/com/rmcolq/pandora/master) | +| [`dev`][dev] | ![Travis (.com) branch](https://img.shields.io/travis/com/rmcolq/pandora/dev) | -[![Build Status](https://travis-ci.com/rmcolq/pandora.svg?token=mxzxNwUzHrkcpsL2i7zU&branch=dev)](https://travis-ci.com/rmcolq/pandora) dev +[master]: https://github.com/rmcolq/pandora/tree/master +[dev]: https://github.com/rmcolq/pandora/tree/dev -![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/rmcolq/pandora) # Pandora -## Contents -* [Introduction](#introduction) -* [Quick Start](#quick-start) -* [Installation](#installation) -* [Usage](#usage) +[TOC]: # + +# Table of Contents +- [Introduction](#introduction) +- [Quick Start](#quick-start) +- [Installation](#installation) + - [Containers](#containers) + - [Installation from source](#installation-from-source) +- [Usage](#usage) + - [Population Reference Graphs](#population-reference-graphs) + - [Build index](#build-index) + - [Map reads to index](#map-reads-to-index) + - [Compare reads from several samples](#compare-reads-from-several-samples) + - [Discover novel variants](#discover-novel-variants) + ## Introduction -Pandora is a tool for bacterial genome analysis using a pangenome reference graph (PanRG). It allows gene presence/absence detection and genotyping of SNPs, indels and longer variants in one or a number of samples. Pandora works with Illumina or Nanopore data. Core ideas behind the method are: - - new genomes look like recombinants (plus mutations) of things seen before - - we should be analysing nucleotide-level variation everywhere, not just in core genes - - arbitrary single reference genomes are unnatural and limit comparisons of diverse sets of genomes -The pangenome reference graph (PanRG) is a collection of 'floating' local graphs, each representing some orthologous region of interest (e.g. genes, mobile elements or intergenic regions). See https://github.com/rmcolq/make_prg for a pipeline which can construct these PRGs from a set of aligned sequence files. +Pandora is a tool for bacterial genome analysis using a pangenome +reference graph (PanRG). It allows gene presence/absence detection and +genotyping of SNPs, indels and longer variants in one or a number of +samples. Pandora works with Illumina or Nanopore data. 
Core ideas behind +the method are: +- new genomes look like recombinants (plus mutations) of things seen + before +- we should be analysing nucleotide-level variation everywhere, not just + in core genes +- arbitrary single reference genomes are unnatural and limit comparisons + of diverse sets of genomes + +The pangenome reference graph (PanRG) is a collection of 'floating' +local graphs, each representing some orthologous region of interest +(e.g. genes, mobile elements or intergenic regions). See +https://github.com/rmcolq/make_prg for a pipeline which can construct +these PRGs from a set of aligned sequence files. Pandora can do the following for a single sample (read dataset): -- Output inferred mosaic of reference sequences for loci (eg genes) from the PanRG which are present -- Output a VCF showing the variation found within these loci, with respect to any reference path in the PRG. +- Output inferred mosaic of reference sequences for loci (eg genes) from + the PanRG which are present +- Output a VCF showing the variation found within these loci, with + respect to any reference path in the PRG. Soon, in a galaxy not so far away, it will allow: - discovery of new variation not in the PRG For a collection of samples, it can: -- Output a matrix showing inferred copy-number of each locus in each sample genome -- Output a multisample pangenome VCF showing how including genotype calls for each sample in each of the loci -- Output one VCF per orthologous-chunk, showing how samples which contained this chunk differed in their gene sequence. Variation is shown with respect to the most informative recombinant path in the PRG. +- Output a matrix showing inferred copy-number of each locus in each + sample genome +- Output a multisample pangenome VCF showing how including genotype + calls for each sample in each of the loci +- Output one VCF per orthologous-chunk, showing how samples which + contained this chunk differed in their gene sequence. Variation is + shown with respect to the most informative recombinant path in the + PRG. Warning - this code is still in development. ## Quick Start + Index PanRG file: + ``` pandora index -t 8 ``` -Compare first 30X of each Illumina sample to get pangenome matrix and VCF + +Compare first 30X of each Illumina sample to get pangenome matrix and +VCF + ``` pandora compare --genotype --illumina --max-covg 30 ``` -Map Nanopore reads from a single sample to get approximate sequence for genes present + +Map Nanopore reads from a single sample to get approximate sequence for +genes present + ``` pandora map ``` @@ -51,21 +91,34 @@ pandora map ## Installation ### Containers -We highly recommend that you download a containerized image of Pandora. Pandora is hosted on Dockerhub and images can be downloaded with the command: + +![Docker Cloud Build Status](https://img.shields.io/docker/cloud/build/rmcolq/pandora) + +We highly recommend that you download a containerized image of Pandora. +Pandora is hosted on Dockerhub and images can be downloaded with the +command: + ``` docker pull rmcolq/pandora:latest ``` + Alternatively, using singularity: + ``` singularity pull docker://rmcolq/pandora:latest ``` + NB For consistency, we no longer maintain images on singularity hub. ### Installation from source -This is not recommended because the required zlib and boost system installs do not always play nicely. -If you want to take the risk: + +This is not recommended because the required zlib and boost system +installs do not always play nicely. 
If you want to take the risk: - Requires a Unix or Mac OS. -- Requires a system install of `zlib`. If this is not already installed, [this](https://geeksww.com/tutorials/libraries/zlib/installation/installing_zlib_on_ubuntu_linux.php) tutorial is helpful or try the following. +- Requires a system install of `zlib`. If this is not already installed, + [this](https://geeksww.com/tutorials/libraries/zlib/installation/installing_zlib_on_ubuntu_linux.php) + tutorial is helpful or try the following. + ``` wget http://www.zlib.net/zlib-1.2.11.tar.gz -O - | tar xzf - cd zlib-1.2.11 @@ -73,14 +126,23 @@ cd zlib-1.2.11 make make install ``` -- Requires a system installation of `boost` containing the `system`, `filesystem`, `log` (which also depends on `thread` and `date_time`) and `iostreams` libraries. If not already installed use the following or look at [this](https://www.boost.org/doc/libs/1_62_0/more/getting_started/unix-variants.html) guide. + +- Requires a system installation of `boost` containing the `system`, + `filesystem`, `log` (which also depends on `thread` and `date_time`) + and `iostreams` libraries. If not already installed use the following + or look at + [this](https://www.boost.org/doc/libs/1_62_0/more/getting_started/unix-variants.html) + guide. + ``` wget https://sourceforge.net/projects/boost/files/boost/1.62.0/boost_1_62_0.tar.gz -O - | tar xzf - cd boost_1_62_0 ./bootstrap.sh [--prefix=/prefix/path] --with-libraries=system,filesystem,iostreams,log,thread,date_time ./b2 install ``` + - Download and install `pandora` as follows: + ``` git clone --single-branch https://github.com/rmcolq/pandora.git --recursive cd pandora @@ -113,11 +175,20 @@ Subcommands: ``` ### Population Reference Graphs -Pandora assumes you have already constructed a fasta-like file of graphs, one entry for each gene/ genome region of interest. -If you haven't, you will need a multiple sequence alignment for each graph. Precompiled collections of MSA representing othologous gene clusters for a number of species can be downloaded from [here](http://pangenome.de/) and converted to graphs using the pipeline from [here](https://github.com/rmcolq/make_prg). + +Pandora assumes you have already constructed a fasta-like file of +graphs, one entry for each gene/ genome region of interest. If you +haven't, you will need a multiple sequence alignment for each graph. +Precompiled collections of MSA representing othologous gene clusters for +a number of species can be downloaded from [here](http://pangenome.de/) +and converted to graphs using the pipeline from +[here](https://github.com/rmcolq/make_prg). ### Build index -Takes a fasta-like file of PanRG sequences and constructs an index, and a directory of gfa files to be used by `pandora map` or `pandora compare`. These are output in the same directory as the PanRG file. + +Takes a fasta-like file of PanRG sequences and constructs an index, and +a directory of gfa files to be used by `pandora map` or `pandora +compare`. These are output in the same directory as the PanRG file. ``` $ pandora index --help @@ -136,10 +207,15 @@ Options: -v Verbosity of logging. Repeat for increased verbosity ``` -The index stores (w,k)-minimizers for each PanRG path found. These parameters can be specified, but default to w=14, k=15. +The index stores (w,k)-minimizers for each PanRG path found. These +parameters can be specified, but default to w=14, k=15. ### Map reads to index -This takes a fasta/q of Nanopore or Illumina reads and compares to the index. 
It infers which of the PanRG genes/elements is present, and for those that are present it outputs the inferred sequence and a genotyped VCF. + +This takes a fasta/q of Nanopore or Illumina reads and compares to the +index. It infers which of the PanRG genes/elements is present, and for +those that are present it outputs the inferred sequence and a genotyped +VCF. ``` $ pandora map --help @@ -199,7 +275,12 @@ Genotyping: ``` ### Compare reads from several samples -This takes Nanopore or Illumina read fasta/q for a number of samples, mapping each to the index. It infers which of the PanRG genes/elements is present in each sample, and outputs a presence/absence pangenome matrix, the inferred sequences for each sample and a genotyped multisample pangenome VCF. + +This takes Nanopore or Illumina read fasta/q for a number of samples, +mapping each to the index. It infers which of the PanRG genes/elements +is present in each sample, and outputs a presence/absence pangenome +matrix, the inferred sequences for each sample and a genotyped +multisample pangenome VCF. ``` $ pandora compare --help @@ -256,7 +337,8 @@ Genotyping: ### Discover novel variants -This will look for regions in the pangraph where the reads do not map and attempt to locally assemble these regions to find novel variants. +This will look for regions in the pangraph where the reads do not map +and attempt to locally assemble these regions to find novel variants. ``` $ pandora discover --help @@ -308,4 +390,5 @@ Filtering: Consensus/Variant Calling: --kmer-avg INT Maximum number of kmers to average over when selecting the maximum likelihood path [default: 100] -``` \ No newline at end of file +``` + diff --git a/ci/install.sh b/ci/install.sh new file mode 100644 index 00000000..f473a7bf --- /dev/null +++ b/ci/install.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -evu + +BOOST_URL="http://sourceforge.net/projects/boost/files/boost/${BOOST_VERSION}/boost_${BOOST_VERSION//\./_}.tar.gz" +BOOST_DIR="${DEPS_DIR}/boost-${BOOST_VERSION}" +BOOST_ROOT="/usr" +export BOOST_DIR +export BOOST_ROOT + +# we cache this boost directory so we don't build it every time +if [[ -z "$(ls -A "${BOOST_DIR}")" ]]; then + mkdir -p "${BOOST_DIR}" + { wget --quiet -O - "${BOOST_URL}" | tar --strip-components=1 -xz -C "${BOOST_DIR}"; } || exit 1 +fi + +cd "$BOOST_DIR" || exit 1 +{ sudo ./bootstrap.sh --with-libraries="$BOOST_LIBS" --prefix="$BOOST_ROOT" && sudo ./b2 install; } || exit 1 diff --git a/ci/script.sh b/ci/script.sh new file mode 100644 index 00000000..f2ca0134 --- /dev/null +++ b/ci/script.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -evu + +export CMAKE_OPTIONS="-DCMAKE_BUILD_TYPE=$BUILD_TYPE " + +echo "$CMAKE_OPTIONS" +mkdir build +cd build || exit 1 +{ cmake "$CMAKE_OPTIONS" .. && make -j4; } || exit 1 +export PATH="${PWD}:${PATH}" +./pandora --help +ctest -V || exit 1 From abc2508e050f4e72ab98c5f0652c237f47052170 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Fri, 30 Oct 2020 16:58:22 +1000 Subject: [PATCH 8/9] change default merge_dist to 15 --- README.md | 2 +- include/denovo_discovery/discover_main.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0722b151..44e0bf79 100644 --- a/README.md +++ b/README.md @@ -357,7 +357,7 @@ Options: -l INT Min. length of consecutive positions below coverage threshold to trigger variant discovery [default: 1] -L INT Max. 
length of consecutive positions below coverage threshold to trigger variant discovery [default: 50]
   -P,--pad INT                Padding either side of candidate variant intervals [default: 22]
-  -d,--merge INT              Merge candidate variant intervals within distance [default: 22]
+  -d,--merge INT              Merge candidate variant intervals within distance [default: 15]
   --min-dbg-dp INT            Minimum node/kmer depth in the de Bruijn graph used for discovering variants [default: 2]
   -v                          Verbosity of logging. Repeat for increased verbosity

diff --git a/include/denovo_discovery/discover_main.h b/include/denovo_discovery/discover_main.h
index e797a069..614f912c 100644
--- a/include/denovo_discovery/discover_main.h
+++ b/include/denovo_discovery/discover_main.h
@@ -41,7 +41,7 @@ struct DiscoverOptions {
     uint32_t min_candidate_len { 1 };
     uint32_t max_candidate_len { 50 };
     uint16_t candidate_padding { 22 };
-    uint32_t merge_dist { 22 };
+    uint32_t merge_dist { 15 };
     uint32_t min_cluster_size { 10 };
     uint32_t max_num_kmers_to_avg { 100 };
     bool clean_dbg { false };

From f39d8c51e2b6ef3237d3beab801a03e995431a61 Mon Sep 17 00:00:00 2001
From: Michael Hall
Date: Mon, 2 Nov 2020 15:46:20 +1000
Subject: [PATCH 9/9] make clean asm graph member variable const

---
 include/denovo_discovery/denovo_discovery.h     | 2 +-
 test/denovo_discovery/denovo_discovery_test.cpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/denovo_discovery/denovo_discovery.h b/include/denovo_discovery/denovo_discovery.h
index fbaf49b1..8c8431a0 100644
--- a/include/denovo_discovery/denovo_discovery.h
+++ b/include/denovo_discovery/denovo_discovery.h
@@ -20,7 +20,7 @@ class DenovoDiscovery {
 public:
     const uint16_t max_insertion_size;
     const uint16_t min_covg_for_node_in_assembly_graph;
-    bool clean_assembly_graph;
+    const bool clean_assembly_graph;

     DenovoDiscovery(const uint16_t& kmer_size, const double& read_error_rate,
         uint16_t max_insertion_size = 15,
diff --git a/test/denovo_discovery/denovo_discovery_test.cpp b/test/denovo_discovery/denovo_discovery_test.cpp
index 462e9901..661ecb79 100644
--- a/test/denovo_discovery/denovo_discovery_test.cpp
+++ b/test/denovo_discovery/denovo_discovery_test.cpp
@@ -211,10 +211,10 @@ TEST(FindPathsThroughCandidateRegionTest, endKmerExistsInStartKmersFindPathAndCy
 TEST(FindPathsThroughCandidateRegionTest,
     doGraphCleaningtwoIdenticalReadsPlusNoiseReturnOnePath)
 {
+    const auto clean { true };
     const int k { 9 };
     const double error_rate { 0.11 };
-    DenovoDiscovery denovo { k, error_rate };
-    denovo.clean_assembly_graph = true;
+    DenovoDiscovery denovo { k, error_rate, 15, 1, clean };
     CandidateRegion candidate_region { Interval(0, 1), "test" };
     candidate_region.max_likelihood_sequence = "ATGCGCTGAGAGTCGGACT";
     candidate_region.pileup