diff --git a/CMakeLists.txt b/CMakeLists.txt index 94e5eaf..bf74ba5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,48 +198,19 @@ else() set_target_properties(alignment-writer PROPERTIES EXCLUDE_FROM_ALL 1) set(CMAKE_ALIGNMENT_WRITER_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/alignment-writer/include) set(CMAKE_ALIGNMENT_WRITER_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/lib/libalignment-writer.a) + set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/alignment-writer/external/BitMagic-7.12.3/src) endif() include_directories(${CMAKE_ALIGNMENT_WRITER_HEADERS}) target_link_libraries(mGEMS ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) target_link_libraries(libmgems ${CMAKE_ALIGNMENT_WRITER_LIBRARY}) -## telescope -if (DEFINED CMAKE_TELESCOPE_LIBRARY AND DEFINED CMAKE_TELESCOPE_HEADERS) - message(STATUS "telescope headers provided in: ${CMAKE_TELESCOPE_HEADERS}") - message(STATUS "telescope library provided in: ${CMAKE_TELESCOPE_LIBRARY}") +## BitMagic +if (DEFINED CMAKE_BITMAGIC_HEADERS) + message(STATUS "BitMagic headers provided in: ${CMAKE_BITMAGIC_HEADERS}") else() - FetchContent_Declare(telescope - GIT_REPOSITORY https://github.com/tmaklin/telescope.git - GIT_TAG v0.7.0-prerelease - PREFIX "external" - SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external/telescope" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/telescope" - BUILD_IN_SOURCE 0 - CMAKE_ARGS -D CMAKE_BXZSTR_HEADERS=${CMAKE_BXZSTR_HEADERS} - -D CMAKE_CXXARGS_HEADERS=${CMAKE_CXXARGS_HEADERS} - -D CMAKE_CXXIO_HEADERS=${CMAKE_CXXIO_HEADERS} - -D CMAKE_ALIGNMENT_WRITER_HEADERS=${CMAKE_ALIGNMENT_WRITER_HEADERS} - -D CMAKE_ALIGNMENT_WRITER_LIBRARY=${CMAKE_ALIGNMENT_WRITER_LIBRARY} - -D CMAKE_BITMAGIC_HEADERS=${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/external/BitMagic-7.12.3/src - -D CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -D "CMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - -D "CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - -D "CMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - INSTALL_COMMAND "" - ) - FetchContent_MakeAvailable(telescope) - add_dependencies(mGEMS libalignmentwriter) - add_dependencies(telescope libalignmentwriter) - add_dependencies(mGEMS telescope) - set_target_properties(telescope PROPERTIES EXCLUDE_FROM_ALL 1) - set(CMAKE_TELESCOPE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/include) - set(CMAKE_TELESCOPE_LIBRARY ${CMAKE_CURRENT_BINARY_DIR}/lib/libtelescope.a) - set(CMAKE_BITMAGIC_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/external/telescope/external/BitMagic-7.12.3/src) + message(FATAL_ERROR "Provide BitMagic C++ headers with -DCMAKE_BITMAGIC_HEADERS") endif() -include_directories(${CMAKE_TELESCOPE_HEADERS} ${CMAKE_BITMAGIC_HEADERS}) -target_link_libraries(mGEMS ${CMAKE_TELESCOPE_LIBRARY}) -target_link_libraries(libmgems ${CMAKE_TELESCOPE_LIBRARY}) +include_directories(${CMAKE_BITMAGIC_HEADERS}) ## seamat if (DEFINED CMAKE_SEAMAT_HEADERS) diff --git a/include/bin_reads.h b/include/bin_reads.h index c91549d..0a483d0 100644 --- a/include/bin_reads.h +++ b/include/bin_reads.h @@ -5,9 +5,10 @@ #include #include -#include "telescope.hpp" #include "Matrix.hpp" +#include "mGEMS_alignment.hpp" + namespace mGEMS { void FilterTargetGroups(const std::vector &group_names, const std::vector &abundances, @@ -37,7 +38,7 @@ void WriteBin(const std::vector &binned_reads, std::ostream &of); // `aln`: Themisto pseudoalignments. // `of`: Stream for the output. void WriteAssignments(const std::vector> &assignments_mat, - const telescope::ThemistoAlignment &aln, std::ostream &of); + const mGEMS::Alignment &aln, std::ostream &of); // mGEMS::Bin // Returns a 2D vector that contains the ids (line numbers in the @@ -55,7 +56,7 @@ void WriteAssignments(const std::vector> &assignments_mat, // `*unassigned_bin`: Vector containing the ids of reads that were not assigned to any bin. // `out_bins`: Vector containing the bins for the groups given in `*target_groups`. // `*assignments_mat`: The read assignment matrix from AssignProbs. -std::vector> Bin(const telescope::ThemistoAlignment &aln, +std::vector> Bin(const mGEMS::Alignment &aln, const std::vector &abundances, const long double theta_frac, const bool single_only, diff --git a/include/mGEMS_alignment.hpp b/include/mGEMS_alignment.hpp new file mode 100644 index 0000000..91bfc2a --- /dev/null +++ b/include/mGEMS_alignment.hpp @@ -0,0 +1,236 @@ +// mGEMS: Estimate abundances of reference lineages in DNA sequencing reads. +// +// MIT License +// +// Copyright (c) 2024 Probabilistic Inference and Computational Biology group @ UH +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +#ifndef MGEMS_ALIGNMENT_HPP +#define MGEMS_ALIGNMENT_HPP + +#include "bm64.h" +#include "unpack.hpp" + +#include "mGEMS_openmp_config.hpp" + +#include +#include +#include +#include + +namespace mGEMS { +class Alignment { +private: + size_t n_targets; + size_t n_queries; + std::vector group_indicators; + size_t n_groups; + std::vector> ec_read_ids; + std::vector ec_counts; + bm::bvector<> bits; + +public: + Alignment(size_t _n_targets) { + this->n_targets = _n_targets; + } + + void ReadPlaintextLine(const size_t n_targets, std::string &line, bm::bvector<>::bulk_insert_iterator &it) { + std::string part; + std::stringstream partition(line); + + // First column is read id (0-based indexing). + std::getline(partition, part, ' '); + size_t read_id = std::stoul(part); + + // Next columns contain the target sequence id (0-based indexing). + while (std::getline(partition, part, ' ')) { + *it = read_id*n_targets + std::stoul(part); // set bit `n_reads*n_refs + std::stoul(part)` as true + } + } + + size_t ReadPlaintextAlignment(const size_t n_targets, std::string &line, std::istream *stream, bm::bvector<> *ec_configs) { + bm::bvector<>::bulk_insert_iterator it(*ec_configs); // Bulk insert iterator buffers the insertions + + size_t n_reads = 1; + try { + // Contents of the first line is already stored in `line` + ReadPlaintextLine(n_targets, line, it); + + size_t compress_interval = 1000000; + while (std::getline(*stream, line)) { + // Insert each line into the alignment + ReadPlaintextLine(n_targets, line, it); + ++n_reads; + if (n_reads % compress_interval == 0) { + ec_configs->optimize(); + } + } + } catch (const std::exception &e) { + std::string msg(e.what()); + if (msg.find("stoul") != std::string::npos) { + throw std::runtime_error("File format not supported on line " + std::to_string(n_reads) + " with content: " + line); + } else { + throw std::runtime_error("Could not parse line " + std::to_string(n_reads) + " with content: " + line); + } + } + return n_reads; + } + + + void read(const std::string &merge_mode, std::vector &strands) { + for (size_t i = 0; i < strands.size(); ++i) { + std::string line; + std::getline(*strands[i], line); // Read the first line to check the format + size_t n_reads; + bm::bvector<> strand_alignment; + if (line.find(',') != std::string::npos) { + // First line contains a ','; stream could be in the compact format. + size_t n_refs; + alignment_writer::ReadHeader(line, &n_reads, &n_refs); + if (n_refs > this->n_targets) { + throw std::runtime_error("Pseudoalignment file has more target sequences than expected."); + } else if (this->n_targets < n_refs) { + throw std::runtime_error("Pseudoalignment file has less target sequences than expected."); + } + // Size is given on the header line. + strand_alignment.resize(n_reads*n_refs); + alignment_writer::UnpackData(strands[i], strand_alignment); + } else { + // Stream could be in the plaintext format. + // Size is unknown. + strand_alignment.set_new_blocks_strat(bm::BM_GAP); + n_reads = ReadPlaintextAlignment(n_targets, line, strands[i], &strand_alignment); + } + this->n_queries = n_reads; + + if (i == 0) { + this->bits = std::move(strand_alignment); + } else { + if (merge_mode == "intersection") { + this->bits.bit_and(strand_alignment); + } else if (merge_mode == "union") { + this->bits.bit_or(strand_alignment); + } else { + throw std::runtime_error("Unrecognized option `" + merge_mode + "` for --themisto-mode"); + } + } + } + } + + void collapse() { + size_t n_threads = 1; +#if defined(MGEMS_OPENMP_SUPPORT) && (MGEMS_OPENMP_SUPPORT) == 1 +#pragma omp parallel + { + n_threads = omp_get_num_threads(); + } +#endif + + std::vector>> mymap(n_threads); +#pragma omp parallel for schedule(static) + for (size_t i = 0; i < this->n_queries; ++i) { + if (bits.any_range(i*this->n_targets, (i + 1)*this->n_targets - 1)) { + size_t hash = 0; + for (size_t j = 0; j < this->n_targets; ++j) { + if (this->bits[i*this->n_targets + j]) { + hash ^= j + 0x517cc1b727220a95 + (hash << 6) + (hash >> 2); + } + } + auto got = mymap[omp_get_thread_num()].find(hash); + if (got == mymap[omp_get_thread_num()].end()) { + mymap[omp_get_thread_num()].insert(std::make_pair(hash, std::vector({(uint32_t)i}))); + } else { + got->second.emplace_back(i); + } + } + } + + std::map> map; + for (size_t i = 0; i < n_threads; ++i) { + for (auto kv : mymap[i]) { + auto got = map.find(kv.first); + if (got == map.end()) { + map.insert(std::make_pair(kv.first, std::move(kv.second))); + } else { + got->second.insert(got->second.end(), + std::make_move_iterator(kv.second.begin()), + std::make_move_iterator(kv.second.end())); + } + } + } + + size_t n_ecs = map.size(); + this->ec_counts = std::vector(n_ecs, 0); + this->ec_read_ids = std::vector>(n_ecs); + size_t i = 0; + std::vector> collapsed_bits(n_threads); + +#pragma omp parallel + { + size_t i = 0; + size_t thread_id = omp_get_thread_num(); + collapsed_bits[thread_id].resize(n_ecs*this->n_targets); + for(auto element = map.begin(); element !=map.end(); ++element, i++) { + if(i%n_threads == thread_id) { + this->ec_read_ids[i] = std::move(element->second); + this->ec_counts[i] = this->ec_read_ids[i].size(); + for (size_t k = 0; k < this->n_targets; ++k) { + collapsed_bits[thread_id][i*this->n_targets + k] = this->bits[this->ec_read_ids[i][0]*this->n_targets + k]; + } + } + } + } + + this->bits.clear(); + for (size_t i = 0; i < n_threads; ++i) { + this->bits.bit_or(collapsed_bits[i]); + } + } + + size_t n_ecs() const { return this->ec_counts.size(); }; + size_t n_reads() const { return this->n_queries; }; + size_t reads_in_ec(const size_t i) const { return this->ec_counts[i]; }; + size_t get_n_targets() const { return this->n_targets; }; + + bool operator()(const size_t row, const size_t col) const { return this->bits[row*this->n_targets + col]; } + + const std::vector>& get_aligned_reads() const { + return this->ec_read_ids; + } + + const std::vector& reads_assigned_to_ec(size_t ec_id) const { return this->ec_read_ids[ec_id]; } + + template + void add_groups(const std::vector &grouping) { + size_t _n_groups = 0; + this->group_indicators = std::vector(grouping.size()); + for (size_t i = 0; i < grouping.size(); ++i) { + group_indicators[i] = grouping[i]; + _n_groups = (n_groups > grouping[i] ? n_groups : grouping[i]); + } + this->n_groups = _n_groups + 1; + } + + const std::vector& get_groups() const { return this->group_indicators; }; + +}; +} + +#endif diff --git a/src/bin_reads.cpp b/src/bin_reads.cpp index 299ddef..4253e31 100644 --- a/src/bin_reads.cpp +++ b/src/bin_reads.cpp @@ -84,7 +84,7 @@ bool EvaluateAssignment(const double abundance, const double threshold, uint32_t return assigned; } -void InsertAssigned(const size_t ec_id, const bool single_only, const size_t n_assignments, const telescope::ThemistoAlignment &alignment, const std::vector &mask, std::vector> *assignments, std::vector> *bins, std::vector *unassigned_bin) { +void InsertAssigned(const size_t ec_id, const bool single_only, const size_t n_assignments, const mGEMS::Alignment &alignment, const std::vector &mask, std::vector> *assignments, std::vector> *bins, std::vector *unassigned_bin) { if (single_only && n_assignments == 1) { for (uint32_t j = 0; j < (*assignments)[ec_id].size(); ++j) { if ((*assignments)[ec_id][j] && mask[j]) { @@ -163,7 +163,7 @@ void AssignProbsMatrix(const std::vector &thresholds, const seamat: } } -void AssignProbs(const std::vector &thresholds, std::istream &probs_file, const std::vector &mask, const telescope::ThemistoAlignment &alignment, const bool single_only, std::vector> *assignments, std::vector> *bins, std::vector *unassigned_bin) { +void AssignProbs(const std::vector &thresholds, std::istream &probs_file, const std::vector &mask, const mGEMS::Alignment &alignment, const bool single_only, std::vector> *assignments, std::vector> *bins, std::vector *unassigned_bin) { // Performs the actual binning based on the precaculated thresholds. // Input: // `thresholds`: The binning thresholds from CalculateThresholds. @@ -203,7 +203,7 @@ void WriteBin(const std::vector &binned_reads, std::ostream &of) { of.flush(); } -void WriteAssignments(const std::vector> &assignments_mat, const telescope::ThemistoAlignment &aln, std::ostream &of) { +void WriteAssignments(const std::vector> &assignments_mat, const mGEMS::Alignment &aln, std::ostream &of) { uint32_t n_groups = assignments_mat[0].size(); uint32_t n_ecs = assignments_mat.size(); for (uint32_t i = 0; i < n_ecs; ++i) { @@ -258,7 +258,7 @@ std::vector> BinFromMatrix(const std::vector> Bin(const telescope::ThemistoAlignment &aln, const std::vector &abundances, const long double theta_frac, const bool single_only, std::istream &probs_file, std::vector *target_groups, std::vector *unassigned_bin, std::vector> *assignments_mat) { +std::vector> Bin(const mGEMS::Alignment &aln, const std::vector &abundances, const long double theta_frac, const bool single_only, std::istream &probs_file, std::vector *target_groups, std::vector *unassigned_bin, std::vector> *assignments_mat) { uint32_t num_ecs = aln.n_ecs(); uint32_t n_groups = abundances.size(); std::vector thresholds(n_groups); diff --git a/src/mGEMS.cpp b/src/mGEMS.cpp index 49a0b9d..a359bfe 100644 --- a/src/mGEMS.cpp +++ b/src/mGEMS.cpp @@ -3,7 +3,6 @@ #include #include -#include "telescope.hpp" #include "cxxargs.hpp" #include "cxxio.hpp" @@ -137,7 +136,9 @@ void Bin(const cxxargs::Arguments &args, bool extract_bins) { n_refs = groups_indicators.count_lines(); groups_indicators.close(); } - telescope::ThemistoAlignment aln = telescope::read::Themisto(telescope::get_mode(args.value("merge-mode")), n_refs, themisto_alns); + mGEMS::Alignment aln(n_refs); + aln.read(args.value("merge-mode"), themisto_alns); + aln.collapse(); cxxio::In probs_file(args.value("probs")); std::vector target_groups;