diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 8c08d0dd..c705a85b 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -17,8 +17,23 @@ if (USE_LIBPLL_CMAKE) ${PLLMOD_SRC}/tree/pll_tree.h ${PLLMOD_SRC}/util/pllmod_util.h ${LIBPLL_SRC}/pll.h + ${PLLMOD_SRC}/dks/benchmark.h + ${PLLMOD_SRC}/dks/dks.h + ${PLLMOD_SRC}/dks/model.h + ${PLLMOD_SRC}/dks/partition.h + ${PLLMOD_SRC}/dks/test_case.h + ${PLLMOD_SRC}/dks/tree.h DESTINATION ${RAXML_LOCALDEPS_DIR}/include/libpll) + # file(COPY ${PLLMOD_SRC}/dks/benchmark.h + # ${PLLMOD_SRC}/dks/dks.h + # ${PLLMOD_SRC}/dks/model.h + # ${PLLMOD_SRC}/dks/msa.h + # ${PLLMOD_SRC}/dks/partition.h + # ${PLLMOD_SRC}/dks/test_case.h + # ${PLLMOD_SRC}/dks/tree.h + # DESTINATION ${RAXML_LOCALDEPS_DIR}/include/libpll/dks) + #target_include_directories(raxml_module PRIVATE ${RAXML_LIBPLL_HEADERS}) else() diff --git a/libs/pll-modules b/libs/pll-modules index 840a1952..186f5dde 160000 --- a/libs/pll-modules +++ b/libs/pll-modules @@ -1 +1 @@ -Subproject commit 840a19525ed133ad243afb23124911b849d4685d +Subproject commit 186f5dde03947eee1f2df7efa8f0ad457abc0b57 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c12f2c5f..cd165f05 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -48,6 +48,7 @@ else() ${RAXML_LOCALDEPS_DIR}/lib/libpll_msa.a ${RAXML_LOCALDEPS_DIR}/lib/libpll_tree.a ${RAXML_LOCALDEPS_DIR}/lib/libpll_util.a + ${RAXML_LOCALDEPS_DIR}/lib/libpll_dks.a ${RAXML_LOCALDEPS_DIR}/lib/libpll.a ) endif() diff --git a/src/CommandLineParser.cpp b/src/CommandLineParser.cpp index 798249c0..da11be71 100644 --- a/src/CommandLineParser.cpp +++ b/src/CommandLineParser.cpp @@ -74,6 +74,7 @@ static struct option long_options[] = {"rf", optional_argument, 0, 0 }, /* 51 */ {"consense", optional_argument, 0, 0 }, /* 52 */ {"ancestral", optional_argument, 0, 0 }, /* 53 */ + {"nodks", no_argument, 0, 0 }, /* 54 */ { 0, 0, 0, 0 } }; @@ -463,6 +464,7 @@ void CommandLineParser::parse_options(int argc, char** argv, Options &opts) } break; case 20: /* SIMD instruction set */ + opts.simd_set = true; if (strcasecmp(optarg, "none") == 0 || strcasecmp(optarg, "scalar") == 0) { opts.simd_arch = PLL_ATTRIB_ARCH_CPU; @@ -834,6 +836,11 @@ void CommandLineParser::parse_options(int argc, char** argv, Options &opts) num_commands++; break; + case 54: + opts.dks_off = true; + break; + + default: throw OptionException("Internal error in option parsing"); } diff --git a/src/Options.hpp b/src/Options.hpp index 4f4f93bc..38d42582 100644 --- a/src/Options.hpp +++ b/src/Options.hpp @@ -45,8 +45,8 @@ class Options bootstop_interval(RAXML_BOOTSTOP_INTERVAL), bootstop_permutations(RAXML_BOOTSTOP_PERMUTES), tbe_naive(false), consense_cutoff(ConsenseCutoff::MR), tree_file(""), constraint_tree_file(""), msa_file(""), model_file(""), outfile_prefix(""), - num_threads(1), num_ranks(1), simd_arch(PLL_ATTRIB_ARCH_CPU), thread_pinning(false), - load_balance_method(LoadBalancing::benoit) + num_threads(1), num_ranks(1), simd_set(false), simd_arch(PLL_ATTRIB_ARCH_CPU), thread_pinning(false), + load_balance_method(LoadBalancing::benoit), dks_off(false) {}; ~Options() = default; @@ -111,9 +111,11 @@ class Options /* parallelization stuff */ unsigned int num_threads; /* number of threads */ unsigned int num_ranks; /* number of MPI ranks */ + bool simd_set; /* did the user specify the simd flag*/ unsigned int simd_arch; /* vector instruction set */ bool thread_pinning; /* pin threads to cores */ LoadBalancing load_balance_method; + bool dks_off; std::string simd_arch_name() const; std::string consense_type_name() const; diff --git a/src/TreeInfo.cpp b/src/TreeInfo.cpp index 3df52846..e2066a7e 100644 --- a/src/TreeInfo.cpp +++ b/src/TreeInfo.cpp @@ -571,55 +571,214 @@ void set_partition_tips(const Options& opts, const MSA& msa, const IDVector& tip pll_set_pattern_weights(partition, comp_weights.data()); } -pll_partition_t* create_pll_partition(const Options& opts, const PartitionInfo& pinfo, - const IDVector& tip_msa_idmap, - const PartitionRange& part_region, const uintVector& weights) -{ - const MSA& msa = pinfo.msa(); - const Model& model = pinfo.model(); +std::vector> create_trimmed_msa(const MSA &msa, + const uintVector &weights, + const IDVector& tip_msa_idmap, + size_t start, size_t length, + size_t trimmed_length) { + std::vector> trimmed_msa; + trimmed_msa.reserve(msa.size()); + + for (size_t tip_id = 0; tip_id < msa.size(); ++tip_id) { + trimmed_msa.emplace_back(trimmed_length); + auto seq_id = tip_msa_idmap.empty() ? tip_id : tip_msa_idmap[tip_id]; + const char *full_seq = msa.at(seq_id).c_str(); + size_t pos = 0; + for (size_t j = start; j < start + length; ++j) { + if (weights[j] > 0) + trimmed_msa[tip_id][pos++] = full_seq[j]; + } + } - /* part_length doesn't include columns with zero weight */ - const size_t part_length = weights.empty() ? part_region.length : - std::count_if(weights.begin() + part_region.start, - weights.begin() + part_region.start + part_region.length, - [](uintVector::value_type w) -> bool - { return w > 0; } - ); + return trimmed_msa; +} - unsigned int attrs = opts.simd_arch; +std::vector> +create_trimmed_prob_msa(const MSA &msa, const IDVector &tip_msa_idmap, + size_t start, size_t length, size_t trimmed_length, + size_t states) { - if (opts.use_rate_scalers && model.num_ratecats() > 1) - { - attrs |= PLL_ATTRIB_RATE_SCALERS; + std::vector> trimmed_msa; + trimmed_msa.reserve(msa.size()); + + auto normalize = !msa.normalized(); + auto weights_iter = msa.weights().cbegin() + start; + + auto clv_size = trimmed_length * states; + for (size_t tip_id = 0; tip_id < msa.size(); ++tip_id) { + trimmed_msa.emplace_back(clv_size); + + auto seq_id = tip_msa_idmap.empty() ? tip_id : tip_msa_idmap[tip_id]; + auto probs = msa.probs(seq_id, start); + + auto clvp = trimmed_msa[tip_id].begin(); + for (size_t i = 0; i < length; ++i) { + if (weights_iter[i] > 0) { + double sum = 0.; + for (size_t j = 0; j < states; ++j) + sum += probs[j]; + + for (size_t j = 0; j < states; ++j) { + if (sum > 0.) + clvp[j] = normalize ? probs[j] / sum : probs[j]; + else + clvp[j] = 1.0; + } + + clvp += states; + } + + /* NB: clv has to be padded, but msa arrays are not! */ + probs += states; + } } - if (opts.use_repeats) - { - assert(!(opts.use_prob_msa)); - attrs |= PLL_ATTRIB_SITE_REPEATS; + return trimmed_msa; +} + +size_t calculate_part_length(const uintVector &weights, + const PartitionRange &part_region) { + return weights.empty() + ? part_region.length + : std::count_if( + weights.begin() + part_region.start, + weights.begin() + part_region.start + part_region.length, + [](uintVector::value_type w) -> bool { return w > 0; }); +} + +uintVector create_trimmed_weights(const uintVector &weights, + size_t part_length) { + uintVector trimmed_weights; + trimmed_weights.reserve(part_length); + for (size_t i = 0; i < weights.size(); i++) { + if (weights[i] > 0) + trimmed_weights.push_back(weights[i]); } - else if (opts.use_tip_inner) - { - assert(!(opts.use_prob_msa)); - // 1) SSE3 tip-inner kernels are not implemented so far, so generic version will be faster - // 2) same for state-rich models - if (opts.simd_arch != PLL_ATTRIB_ARCH_SSE && model.num_states() <= 20) - { - // TODO: use proper auto-tuning - const unsigned long min_len_ti = model.num_states() > 4 ? 40 : 100; - if ((unsigned long) part_length > min_len_ti) - attrs |= PLL_ATTRIB_PATTERN_TIP; + return trimmed_weights; +} + +unsigned int create_pll_partition_attrs(const Options &opts, const MSA &msa, + const Model &model, size_t part_length, + const IDVector &tip_msa_idmap, + const PartitionRange &part_region, + const uintVector &weights) { + unsigned int attrs = 0; + + if (!opts.dks_off) { + LOG_INFO_TS << "Starting DKS" << std::endl; + if (opts.simd_set) { + attrs |= opts.simd_arch; + } + if (opts.use_rate_scalers && model.num_ratecats() > 1) { + attrs |= PLL_ATTRIB_RATE_SCALERS; + } + if (opts.use_repeats) { + assert(!(opts.use_prob_msa)); + attrs |= PLL_ATTRIB_SITE_REPEATS; + } else if (opts.use_tip_inner) { + attrs |= PLL_ATTRIB_PATTERN_TIP; + } + + dks::attributes_generator_t gen; + gen.enable(attrs); + + // disable invalid cpu flags + + unsigned int off_flags = 0; + + if (!PLL_STAT(sse3_present)) + off_flags |= PLL_ATTRIB_ARCH_SSE; + if (!PLL_STAT(avx_present)) + off_flags |= PLL_ATTRIB_ARCH_AVX; + if (!PLL_STAT(avx2_present)) + off_flags |= PLL_ATTRIB_ARCH_AVX2; + + // for now, unconditionally disable avx512 + + off_flags |= PLL_ATTRIB_ARCH_AVX512; + gen.disable(off_flags); + + auto trimmed_weights = create_trimmed_weights(weights, part_length); + if (opts.use_prob_msa && msa.probabilistic()) { + auto trimmed_msa = create_trimmed_prob_msa( + msa, tip_msa_idmap, part_region.start, part_region.length, + part_length, msa.states()); + attrs = + dks::select_kernel_auto(trimmed_msa, trimmed_weights, + model.num_states(), model.num_states(), gen); + } else { + auto trimmed_msa = + create_trimmed_msa(msa, weights, tip_msa_idmap, part_region.start, + part_region.length, part_length); + attrs = + dks::select_kernel_auto(trimmed_msa, trimmed_weights, model.charmap(), + model.num_states(), model.num_states(), gen); + } + LOG_INFO_TS << "DKS Finshed" << std::endl; + } else { + if (opts.use_repeats) { + assert(!(opts.use_prob_msa)); + attrs |= PLL_ATTRIB_SITE_REPEATS; + } else if (opts.use_tip_inner) { + assert(!(opts.use_prob_msa)); + // 1) SSE3 tip-inner kernels are not implemented so far, so generic + // version will be faster 2) same for state-rich models + if (opts.simd_arch != PLL_ATTRIB_ARCH_SSE && model.num_states() <= 20) { + const unsigned long min_len_ti = model.num_states() > 4 ? 40 : 100; + if ((unsigned long)part_length > min_len_ti) + attrs |= PLL_ATTRIB_PATTERN_TIP; + } } } - // NOTE: if partition is split among multiple threads, asc. bias correction must be applied only once! + // NOTE: if partition is split among multiple threads, asc. bias correction + // must be applied only once! if (model.ascbias_type() == AscBiasCorrection::lewis || - (model.ascbias_type() != AscBiasCorrection::none && part_region.master())) - { - attrs |= PLL_ATTRIB_AB_FLAG; - attrs |= (unsigned int) model.ascbias_type(); + (model.ascbias_type() != AscBiasCorrection::none && + part_region.master())) { + attrs |= PLL_ATTRIB_AB_FLAG; + attrs |= (unsigned int)model.ascbias_type(); } + return attrs; +} + +std::vector +create_partition_attr_list(const Options &opts, const PartitionedMSA &msa, + const IDVector &tip_msa_idmap, + const PartitionAssignment& part_assign, + const std::vector &site_weights) { + + std::vector attr_list; + attr_list.reserve(msa.part_count()); + + for (size_t i = 0; i < msa.part_count(); i++) { + const PartitionInfo& pinfo = msa.part_info(i); + auto part_region = *(part_assign.find(i)); + auto weights = site_weights.empty() ? pinfo.msa().weights() : site_weights.at(i); + attr_list.push_back(create_pll_partition_attrs( + opts, + pinfo.msa(), + pinfo.model(), + calculate_part_length(weights, part_region), + tip_msa_idmap, + part_region, + weights)); + + } + return attr_list; +} + +pll_partition_t* create_pll_partition(const Options& opts, const PartitionInfo& pinfo, + const IDVector& tip_msa_idmap, + const PartitionRange& part_region, const uintVector& weights) +{ + const MSA& msa = pinfo.msa(); + const Model& model = pinfo.model(); + + /* part_length doesn't include columns with zero weight */ + const size_t part_length = calculate_part_length(weights, part_region); + unsigned int attrs = create_pll_partition_attrs(opts, msa, model, part_length, tip_msa_idmap, part_region, weights); BasicTree tree(msa.size()); pll_partition_t * partition = pll_partition_create( tree.num_tips(), /* number of tip sequences */ diff --git a/src/common.h b/src/common.h index 0c32cab5..bf0900a0 100644 --- a/src/common.h +++ b/src/common.h @@ -17,6 +17,7 @@ extern "C" { #include #include } +#include #include "types.hpp" #include "constants.hpp" diff --git a/src/main.cpp b/src/main.cpp index b0cc222c..719f1da7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -2202,7 +2202,6 @@ void master_main(RaxmlInstance& instance, CheckpointManager& cm) load_parted_msa(instance); assert(instance.parted_msa); auto& parted_msa = *instance.parted_msa; - load_constraint(instance); check_options(instance);