diff --git a/include/htool/hmatrix/hmatrix.hpp b/include/htool/hmatrix/hmatrix.hpp index ea9d7cf7..0c5a82c6 100644 --- a/include/htool/hmatrix/hmatrix.hpp +++ b/include/htool/hmatrix/hmatrix.hpp @@ -36,17 +36,10 @@ class HMatrix : public TreeNode> m_low_rank_data{nullptr}; // Cached leaves - // std::vector m_dense_leaves{}; - // std::vector m_dense_leaves_in_diagonal_block{}; - // std::vector m_diagonal_dense_leaves{}; - // std::vector m_low_rank_leaves{}; - // std::vector m_low_rank_leaves_in_diagonal_block{}; - // std::vector m_diagonal_low_rank_leaves{}; mutable std::vector m_leaves{}; mutable std::vector m_leaves_for_symmetry{}; mutable char m_symmetry_type_for_leaves{'N'}; mutable char m_UPLO_for_leaves{'N'}; - // std::vector m_leaves_in_diagonal_block{}; StorageType m_storage_type{StorageType::Hierarchical}; @@ -81,10 +74,6 @@ class HMatrix : public TreeNode &target_cluster, const Cluster &source_cluster) : TreeNode>(), m_target_cluster(&target_cluster), m_source_cluster(&source_cluster) { @@ -254,21 +243,6 @@ class HMatrix : public TreeNode &get_infos() const { return infos; } - // std::string get_infos(const std::string &key) const { return infos[key]; } - // void add_info(const std::string &keyname, const std::string &value) const { infos[keyname] = value; } - // void print_infos() const; - // void save_infos(const std::string &outputname, std::ios_base::openmode mode = std::ios_base::app, const std::string &sep = " = ") const; - // underlying_type compressed_size() const; - // double compression_ratio() const; - // double space_saving() const; - // friend underlying_type Frobenius_absolute_error(const HMatrix &B, const VirtualGenerator &A); - - // // // Output structure - // void save_plot(const std::string &outputname) const; - // std::vector get_output() const; - // Data computation void compute_dense_data(const VirtualGenerator &generator) { m_dense_data = std::make_unique>(m_target_cluster->get_size(), m_source_cluster->get_size()); @@ -291,1677 +265,8 @@ class HMatrix : public TreeNodeassign(dense_matrix.nb_rows(), dense_matrix.nb_cols(), dense_matrix.release(), true); m_storage_type = StorageType::Dense; } - - // Linear algebra - void add_vector_product(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) const; - void add_matrix_product_row_major(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) const; - - // void add_vector_product(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) const; - // void add_matrix_product(CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) const; - - // void mvprod_global_to_global(const T *const in, T *const out, const int &mu = 1) const; - // void mvprod_local_to_local(const T *const in, T *const out, const int &mu = 1, T *work = nullptr) const; - - // void mvprod_transp_global_to_global(const T *const in, T *const out, const int &mu = 1) const; - // void mvprod_transp_local_to_local(const T *const in, T *const out, const int &mu = 1, T *work = nullptr) const; - - // void mymvprod_local_to_local(const T *const in, T *const out, const int &mu = 1, T *work = nullptr) const; - // void mymvprod_global_to_local(const T *const in, T *const out, const int &mu = 1) const; - // void mymvprod_transp_local_to_local(const T *const in, T *const out, const int &mu = 1, T *work = nullptr) const; - // void mymvprod_transp_local_to_global(const T *const in, T *const out, const int &mu = 1) const; - - // void mvprod_subrhs(const T *const in, T *const out, const int &mu, const int &offset, const int &size, const int &margin) const; - // std::vector operator*(const std::vector &x) const; - // Matrix operator*(const Matrix &x) const; - - // // Permutations - // void source_to_cluster_permutation(const T *const in, T *const out) const; - // void local_target_to_local_cluster(const T *const in, T *const out, MPI_Comm comm = MPI_COMM_WORLD) const; - // void local_source_to_local_cluster(const T *const in, T *const out, MPI_Comm comm = MPI_COMM_WORLD) const; - // void local_cluster_to_local_target(const T *const in, T *const out, MPI_Comm comm = MPI_COMM_WORLD) const; - // void local_cluster_to_local_source(const T *const in, T *const out, MPI_Comm comm = MPI_COMM_WORLD) const; - - // // local to global - // void local_to_global_source(const T *const in, T *const out, const int &mu) const; - // void local_to_global_target(const T *const in, T *const out, const int &mu) const; - - // // Convert - // Matrix get_local_dense() const; - // Matrix get_local_dense_perm() const; - // void copy_to_dense(CoefficientPrecision *) const; - - // // Apply Dirichlet condition - // void apply_dirichlet(const std::vector &boundary); }; -template -void HMatrix::add_vector_product(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) const { - switch (m_storage_type) { - case StorageType::Dense: - if (m_symmetry == 'N') { - m_dense_data->add_vector_product(trans, alpha, in, beta, out); - } else { - m_dense_data->add_vector_product_symmetric(trans, alpha, in, beta, out, m_UPLO, m_symmetry); - } - break; - case StorageType::LowRank: - m_low_rank_data->add_vector_product(trans, alpha, in, beta, out); - break; - default: - threaded_hierarchical_add_vector_product(trans, alpha, in, beta, out); - break; - } -} - -template -void HMatrix::add_matrix_product_row_major(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) const { - switch (m_storage_type) { - case StorageType::Dense: - if (m_symmetry == 'N') { - m_dense_data->add_matrix_product_row_major(trans, alpha, in, beta, out, mu); - } else { - m_dense_data->add_matrix_product_symmetric_row_major(trans, alpha, in, beta, out, mu, m_UPLO, m_symmetry); - } - break; - case StorageType::LowRank: - m_low_rank_data->add_matrix_product_row_major(trans, alpha, in, beta, out, mu); - break; - default: - threaded_hierarchical_add_matrix_product_row_major(trans, alpha, in, beta, out, mu); - break; - } -} - -// // build -// template -// void HMatrix::build(VirtualGenerator &mat) { - -// // std::vector mytimes(3), maxtime(3), meantime(3); - -// // // Default compression: sympartialACA -// // if (m_block_tree_properties->m_low_rank_generator == nullptr) { -// // m_block_tree_properties->m_low_rank_generator = std::make_shared>(); -// // } - -// // Build block tree -// bool not_pushed = false; -// // if (m_blockm_block_tree_properties->m_UPLO == 'U' || m_block_tree_properties->m_UPLO == 'L') { -// // not_pushed = build_symmetric_block_tree(); -// // } else { -// not_pushed = build_block_tree(); -// // } -// // for (const std::unique_ptr> &child : m_children) { -// // std::cout << child->get_target_cluster_tree().get_offset() << " " << child->get_target_cluster_tree().get_size() << " " << child->get_source_cluster_tree().get_offset() << " " << child->get_source_cluster_tree().get_size() << "\n"; -// // } - -// reset_root_of_block_tree(); -// if (not_pushed) { -// m_block_tree_properties->m_tasks.push_back(this); -// } - -// // for (const std::unique_ptr> &child : m_children) { -// // std::cout << child->get_target_cluster_tree().get_offset() << " " << child->get_target_cluster_tree().get_size() << " " << child->get_source_cluster_tree().get_offset() << " " << child->get_source_cluster_tree().get_size() << "\n"; -// // } - -// // // Sort local tasks -// // std::sort(m_block_tree_properties->m_tasks.begin(), m_block_tree_properties->m_tasks.end(), [](HMatrix *hmatrix_A, HMatrix *hmatrix_B) { return (hmatrix_A->m_source_cluster_tree->get_offset() == hmatrix_B->m_source_cluster_tree->get_offset()) ? (hmatrix_A->m_target_cluster_tree->get_offset() < hmatrix_B->m_target_cluster_tree->get_offset()) : hmatrix_A->m_source_cluster_tree->get_offset() < hmatrix_B->m_source_cluster_tree->get_offset(); }); - -// // Compute blocks -// compute_blocks(mat); - -// // // Infos -// // ComputeInfos(mytimes); -// } - -// // Symmetry build -// template -// void HMatrix::build(VirtualGenerator &mat, const double *const xt) { - -// MPI_Comm_size(comm, &sizeWorld); -// MPI_Comm_rank(comm, &rankWorld); -// std::vector mytimes(3), maxtime(3), meantime(3); - -// this->nc = mat.nb_cols(); -// this->nr = mat.nb_rows(); -// this->dimension = mam_target_cluster_tree->get_dimension(); - -// // Default compression: sympartialACA -// if (this->LowRankGenerator == nullptr) { -// this->LowRankGenerator = std::make_shared>(); -// } - -// // Default admissibility condition -// if (this->AdmissibilityCondition == nullptr) { -// this->AdmissibilityCondition = std::make_shared(); -// } - -// // Zero generator when we delay the dense computation -// if (delay_dense_computation) { -// zerogenerator = std::unique_ptr>(new ZeroGenerator(mat.nb_rows(), mat.nb_cols(), mam_target_cluster_tree->get_dimension())); -// } - -// // Construction arbre des paquets -// local_size = cluster_tree_t->get_local_size(); -// local_offset = cluster_tree_t->get_local_offset(); - -// // Construction arbre des blocs -// double time = MPI_Wtime(); - -// if (this->OffDiagonalApproximation != nullptr) { -// this->BlockTree.reset(new Block(this->AdmissibilityCondition->get(), cluster_tree_t->get_local_cluster(), cluster_tree_s->get_local_cluster())); -// } else { -// this->BlockTree.reset(new Block(this->AdmissibilityCondition->get(), *cluster_tree_t, *cluster_tree_s)); -// } -// this->BlockTree->set_mintargetdepth(m_mintargetdepth); -// this->BlockTree->set_minsourcedepth(m_minsourcedepth); -// this->BlockTree->set_maxblocksize(this->maxblocksize); -// this->BlockTree->set_eta(this->eta); -// bool force_sym = true; -// this->BlockTree->build(UPLO, force_sym, comm); - -// mytimes[0] = MPI_Wtime() - time; - -// // Assemblage des sous-matrices -// time = MPI_Wtime(); -// ComputeBlocks(mat, xt, xt); -// mytimes[1] = MPI_Wtime() - time; - -// // Infos -// ComputeInfos(mytimes); -// } - -// template -// void HMatrix::build_dense_blocks(VirtualDenseBlocksGenerator &dense_block_generator) { - -// int number_of_dense_blocks = m_block_tree_properties->m_dense_leaves.size(); -// auto &dense_blocks = m_block_tree_properties->m_dense_leaves; - -// std::vector row_sizes(number_of_dense_blocks, 0), col_sizes(number_of_dense_blocks, 0); -// std::vector rows(number_of_dense_blocks, nullptr), cols(number_of_dense_blocks, nullptr); -// std::vector ptr(number_of_dense_blocks, nullptr); - -// for (int i = 0; i < number_of_dense_blocks; i++) { -// row_sizes[i] = dense_blocks[i]->get_target_cluster()->get_size(); -// col_sizes[i] = dense_blocks[i]->get_source_cluster()->get_size(); -// rows[i] = dense_blocks[i]->get_target_cluster()->get_perm_data(); -// cols[i] = dense_blocks[i]->get_source_cluster()->get_perm_data(); -// ptr[i] = dense_blocks[i]->get_dense_block_data()->data(); -// } -// dense_block_generator.copy_dense_blocks(row_sizes, col_sizes, rows, cols, ptr); -// } - -// // // Compute blocks recursively -// template -// void HMatrix::compute_blocks(const VirtualGenerator &mat) { -// #if defined(_OPENMP) && !defined(PYTHON_INTERFACE) -// # pragma omp parallel -// #endif -// { -// std::vector *> local_dense_leaves{}; -// std::vector *> local_low_rank_leaves{}; -// std::vector &local_tasks = m_block_tree_properties->m_tasks; - -// int false_positive_local = 0; -// #if defined(_OPENMP) && !defined(PYTHON_INTERFACE) -// # pragma omp for schedule(guided) -// #endif -// for (int p = 0; p < m_block_tree_properties->m_tasks.size(); p++) { -// if (!local_tasks[p]->is_admissible()) { -// local_tasks[p]->add_dense_leaf(mat, local_dense_leaves); -// } else { -// // bool not_pushed; -// local_tasks[p]->add_low_rank_leaf(mat, local_low_rank_leaves); -// if (local_tasks[p]->get_low_rank_data()->rank_of() == -1) { -// local_tasks[p]->m_low_rank_data.reset(); -// local_low_rank_leaves.pop_back(); -// local_tasks[p]->add_dense_leaf(mat, local_dense_leaves); -// } -// // if (m_symmetry == 'H' || m_symmetry == 'S') { -// // not_pushed = ComputeAdmissibleBlocksSym(mat, *(local_tasks[p]), xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// // } else { -// // not_pushed = ComputeAdmissibleBlock(mat, *(local_tasks[p]), xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// // } - -// // if (not_pushed) { -// // local_tasks[p]->add_dense_leaf(mat, local_dense_leaves); -// // } -// } -// } -// #if defined(_OPENMP) && !defined(PYTHON_INTERFACE) -// # pragma omp critical -// #endif -// { -// m_block_tree_properties->m_low_rank_leaves.insert(m_block_tree_properties->m_low_rank_leaves.end(), std::make_move_iterator(local_low_rank_leaves.begin()), std::make_move_iterator(local_low_rank_leaves.end())); - -// m_block_tree_properties->m_dense_leaves.insert(m_block_tree_properties->m_dense_leaves.end(), std::make_move_iterator(local_dense_leaves.begin()), std::make_move_iterator(local_dense_leaves.end())); - -// m_block_tree_properties->m_false_positive += false_positive_local; -// } -// } - -// m_block_tree_properties->m_leaves.insert(m_block_tree_properties->m_leaves.end(), m_block_tree_properties->m_dense_leaves.begin(), m_block_tree_properties->m_dense_leaves.end()); -// m_block_tree_properties->m_leaves.insert(m_block_tree_properties->m_leaves.end(), m_block_tree_properties->m_low_rank_leaves.begin(), m_block_tree_properties->m_low_rank_leaves.end()); - -// if (m_block_tree_properties->m_block_diagonal_hmatrix != nullptr) { -// int local_offset_s = m_block_tree_properties->m_block_diagonal_hmatrix->get_source_cluster_tree().get_offset(); -// int local_size_s = m_block_tree_properties->m_block_diagonal_hmatrix->get_source_cluster_tree().get_size(); - -// // Build vectors of pointers for diagonal blocks -// for (auto leaf : m_block_tree_properties->m_leaves) { -// if (local_offset_s <= leaf->get_source_cluster_tree().get_offset() && leaf->get_source_cluster_tree().get_offset() < local_offset_s + local_size_s) { -// m_block_tree_properties->m_leaves_in_diagonal_block.push_back(leaf); -// } -// } -// for (auto low_rank_leaf : m_block_tree_properties->m_low_rank_leaves) { -// if (local_offset_s <= low_rank_leaf->get_source_cluster_tree().get_offset() && low_rank_leaf->get_source_cluster_tree().get_offset() < local_offset_s + local_size_s) { -// m_block_tree_properties->m_low_rank_leaves_in_diagonal_block.push_back(low_rank_leaf); -// if (low_rank_leaf->get_source_cluster_tree().get_offset() == low_rank_leaf->get_target_cluster_tree().get_offset()) { -// m_block_tree_properties->m_diagonal_low_rank_leaves.push_back(low_rank_leaf); -// } -// } -// } -// for (auto dense_leaf : m_block_tree_properties->m_dense_leaves) { -// if (local_offset_s <= dense_leaf->get_source_cluster_tree().get_offset() && dense_leaf->get_source_cluster_tree().get_offset() < local_offset_s + local_size_s) { -// m_block_tree_properties->m_dense_leaves_in_diagonal_block.push_back(dense_leaf); -// if (dense_leaf->get_source_cluster_tree().get_offset() == dense_leaf->get_target_cluster_tree().get_offset()) { -// m_block_tree_properties->m_diagonal_dense_leaves.push_back(dense_leaf); -// } -// } -// } -// } -// } - -// template -// bool HMatrix::ComputeAdmissibleBlock(VirtualGenerator &mat, Block &task, const double *const xt, const double *const xs, std::vector *> &MyComputedBlocks_local, std::vector *> &MyNearFieldMats_local, std::vector *> &MyFarFieldMats_local, int &false_positive_local) { -// if (task.IsAdmissible()) { // When called recursively, it may not be admissible -// AddFarFieldMat(mat, task, xt, xs, MyComputedBlocks_local, MyFarFieldMats_local, reqrank); -// if (MyFarFieldMats_local.back()->get_rank_of() != -1) { -// return false; -// } else { -// MyComputedBlocks_local.back()->clear_data(); -// MyFarFieldMats_local.pop_back(); -// MyComputedBlocks_local.pop_back(); -// false_positive_local += 1; -// } -// } -// // We could compute a dense block if its size is small enough, we focus on improving compression for now -// // else if (task->get_size()get_size(); -// const VirtualCluster &t = m_target_cluster; -// const VirtualCluster &s = m_source_cluster. - -// if (s.IsLeaf()) { -// if (t.IsLeaf()) { -// return true; -// } else { -// std::vector Blocks_not_pushed(m_target_cluster_tree->number_of_children()); -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// task.build_son(m_target_cluster_tree->get_children()[p], s); - -// Blocks_not_pushed[p] = ComputeAdmissibleBlock(mat, task->get_children()[p], xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } - -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } -// } else { -// if (t.IsLeaf()) { -// std::vector Blocks_not_pushed(m_source_cluster_tree->number_of_children()); -// for (int p = 0; p < m_source_cluster_tree->number_of_children(); p++) { -// task.build_son(t, m_source_cluster_tree->get_children()[p]); -// Blocks_not_pushed[p] = ComputeAdmissibleBlock(mat, task->get_children()[p], xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } - -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < m_source_cluster_tree->number_of_children(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } else { -// if (m_target_cluster_tree->get_size() > m_source_cluster_tree->get_size()) { -// std::vector Blocks_not_pushed(m_target_cluster_tree->number_of_children()); -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// task.build_son(m_target_cluster_tree->get_children()[p], s); -// Blocks_not_pushed[p] = ComputeAdmissibleBlock(mat, task->get_children()[p], xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } - -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } else { -// std::vector Blocks_not_pushed(m_source_cluster_tree->number_of_children()); -// for (int p = 0; p < m_source_cluster_tree->number_of_children(); p++) { -// task.build_son(t, m_source_cluster_tree->get_children()[p]); -// Blocks_not_pushed[p] = ComputeAdmissibleBlock(mat, task->get_children()[p], xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } - -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < m_source_cluster_tree->number_of_children(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } -// } -// } -// } - -// template -// bool HMatrix::ComputeAdmissibleBlocksSym(VirtualGenerator &mat, Block &task, const double *const xt, const double *const xs, std::vector *> &MyComputedBlocks_local, std::vector *> &MyNearFieldMats_local, std::vector *> &MyFarFieldMats_local, int &false_positive_local) { - -// if (task.IsAdmissible()) { - -// AddFarFieldMat(mat, task, xt, xs, MyComputedBlocks_local, MyFarFieldMats_local, reqrank); -// if (MyFarFieldMats_local.back()->get_rank_of() != -1) { -// return false; -// } else { -// MyComputedBlocks_local.back()->clear_data(); -// MyFarFieldMats_local.pop_back(); -// MyComputedBlocks_local.pop_back(); -// false_positive_local += 1; -// // AddNearFieldMat(mat, task,MyComputedBlocks_local, MyNearFieldMats_local); -// // return false; -// } -// } -// // We could compute a dense block if its size is small enough, we focus on improving compression for now -// // else if (task->get_size()get_size(); -// const VirtualCluster &t = m_target_cluster; -// const VirtualCluster &s = m_source_cluster. - -// if (s.IsLeaf()) { -// if (t.IsLeaf()) { -// return true; -// } else { -// std::vector Blocks_not_pushed(m_target_cluster_tree->number_of_children()); -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// task.build_son(m_target_cluster_tree->get_children()[p], s); -// Blocks_not_pushed[p] = ComputeAdmissibleBlocksSym(mat, task->get_children()[p], xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } - -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } -// } else { -// if (t.IsLeaf()) { -// std::vector Blocks_not_pushed(m_source_cluster_tree->number_of_children()); -// for (int p = 0; p < m_source_cluster_tree->number_of_children(); p++) { -// task.build_son(t, m_source_cluster_tree->get_children()[p]); -// Blocks_not_pushed[p] = ComputeAdmissibleBlocksSym(mat, task->get_children()[p], xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } - -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < m_source_cluster_tree->number_of_children(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } else { -// std::vector Blocks_not_pushed(m_target_cluster_tree->number_of_children() * m_source_cluster_tree->number_of_children()); -// for (int l = 0; l < m_source_cluster_tree->number_of_children(); l++) { -// for (int p = 0; p < m_target_cluster_tree->number_of_children(); p++) { -// task.build_son(m_target_cluster_tree->get_children()[p], m_source_cluster_tree->get_son(l)); -// Blocks_not_pushed[p + l * m_target_cluster_tree->number_of_children()] = ComputeAdmissibleBlocksSym(mat, task->get_son(p + l * m_target_cluster_tree->number_of_children()), xt, xs, MyComputedBlocks_local, MyNearFieldMats_local, MyFarFieldMats_local, false_positive_local); -// } -// } -// if ((bsize <= maxblocksize) && std::all_of(Blocks_not_pushed.begin(), Blocks_not_pushed.end(), [](bool i) { return i; })) { -// task.clear_sons(); -// return true; -// } else { -// for (int p = 0; p < Blocks_not_pushed.size(); p++) { -// if (Blocks_not_pushed[p]) { -// AddNearFieldMat(mat, task->get_children()[p], MyComputedBlocks_local, MyNearFieldMats_local); -// } -// } -// return false; -// } -// } -// } -// } - -// // Build a dense block -// template -// void HMatrix::add_dense_leaf(const VirtualGenerator &mat, std::vector *> &local_dense_leaves) { - -// if (!m_block_tree_properties->m_delay_dense_computation) { -// m_dense_data = std::unique_ptr>(new Matrix(m_target_cluster_tree->get_size(), m_source_cluster_tree->get_size())); - -// mat.copy_submatrix(m_target_cluster_tree->get_size(), m_source_cluster_tree->get_size(), m_target_cluster_tree->get_offset(), m_source_cluster_tree->get_offset(), m_dense_data->data()); -// } -// local_dense_leaves.push_back(this); -// m_storage_type = StorageType::Dense; -// } - -// // // Build a low rank block -// template -// void HMatrix::add_low_rank_leaf(const VirtualGenerator &mat, std::vector *> &local_low_rank_leaves) { - -// m_low_rank_data = std::unique_ptr>(new LowRankMatrix(mat, *m_block_tree_properties->m_low_rank_generator, *m_target_cluster_tree, *m_source_cluster_tree, m_block_tree_properties->m_reqrank, m_block_tree_properties->m_epsilon)); - -// local_low_rank_leaves.push_back(this); -// m_storage_type = StorageType::LowRank; -// } - -// // Compute infos -// template -// void HMatrix::ComputeInfos(const std::vector &mytime) { -// // 0 : block tree ; 1 : compute blocks ; -// std::vector maxtime(2), meantime(2); -// // 0 : dense mat ; 1 : lr mat ; 2 : rank ; 3 : local_size -// std::vector maxinfos(4, 0), mininfos(4, std::max(nc, nr)); -// std::vector meaninfos(4, 0); -// // Infos -// for (int i = 0; i < MyNearFieldMats.size(); i++) { -// std::size_t size = MyNearFieldMats[i]->get_target_cluster()->get_size() * MyNearFieldMats[i]->get_source_cluster()->get_size(); -// maxinfos[0] = std::max(maxinfos[0], size); -// mininfos[0] = std::min(mininfos[0], size); -// meaninfos[0] += size; -// } -// for (int i = 0; i < MyFarFieldMats.size(); i++) { -// std::size_t size = MyFarFieldMats[i]->get_target_cluster()->get_size() * MyFarFieldMats[i]->get_source_cluster()->get_size(); -// std::size_t rank = MyFarFieldMats[i]->get_rank_of(); -// maxinfos[1] = std::max(maxinfos[1], size); -// mininfos[1] = std::min(mininfos[1], size); -// meaninfos[1] += size; -// maxinfos[2] = std::max(maxinfos[2], rank); -// mininfos[2] = std::min(mininfos[2], rank); -// meaninfos[2] += rank; -// } -// maxinfos[3] = local_size; -// mininfos[3] = local_size; -// meaninfos[3] = local_size; - -// if (rankWorld == 0) { -// MPI_Reduce(MPI_IN_PLACE, &(maxinfos[0]), 4, my_MPI_SIZE_T, MPI_MAX, 0, comm); -// MPI_Reduce(MPI_IN_PLACE, &(mininfos[0]), 4, my_MPI_SIZE_T, MPI_MIN, 0, comm); -// MPI_Reduce(MPI_IN_PLACE, &(meaninfos[0]), 4, MPI_DOUBLE, MPI_SUM, 0, comm); -// MPI_Reduce(MPI_IN_PLACE, &(false_positive), 1, MPI_INT, MPI_SUM, 0, comm); -// } else { -// MPI_Reduce(&(maxinfos[0]), &(maxinfos[0]), 4, my_MPI_SIZE_T, MPI_MAX, 0, comm); -// MPI_Reduce(&(mininfos[0]), &(mininfos[0]), 4, my_MPI_SIZE_T, MPI_MIN, 0, comm); -// MPI_Reduce(&(meaninfos[0]), &(meaninfos[0]), 4, MPI_DOUBLE, MPI_SUM, 0, comm); -// MPI_Reduce(&(false_positive), &(false_positive), 1, MPI_INT, MPI_SUM, 0, comm); -// } - -// int nlrmat = this->get_nlrmat(); -// int ndmat = this->get_ndmat(); -// meaninfos[0] = (ndmat == 0 ? 0 : meaninfos[0] / ndmat); -// meaninfos[1] = (nlrmat == 0 ? 0 : meaninfos[1] / nlrmat); -// meaninfos[2] = (nlrmat == 0 ? 0 : meaninfos[2] / nlrmat); -// meaninfos[3] = meaninfos[3] / sizeWorld; -// mininfos[0] = (ndmat == 0 ? 0 : mininfos[0]); -// mininfos[1] = (nlrmat == 0 ? 0 : mininfos[1]); -// mininfos[2] = (nlrmat == 0 ? 0 : mininfos[2]); - -// // timing -// MPI_Reduce(&(mytime[0]), &(maxtime[0]), 2, MPI_DOUBLE, MPI_MAX, 0, comm); -// MPI_Reduce(&(mytime[0]), &(meantime[0]), 2, MPI_DOUBLE, MPI_SUM, 0, comm); - -// meantime /= sizeWorld; - -// infos["Block_tree_mean"] = NbrToStr(meantime[0]); -// infos["Block_tree_max"] = NbrToStr(maxtime[0]); -// infos["Blocks_mean"] = NbrToStr(meantime[1]); -// infos["Blocks_max"] = NbrToStr(maxtime[1]); - -// // Size -// infos["Source_size"] = NbrToStr(this->nc); -// infos["Target_size"] = NbrToStr(this->nr); -// infos["Dimension"] = NbrToStr(this->dimension); -// infos["Dense_block_size_max"] = NbrToStr(maxinfos[0]); -// infos["Dense_block_size_mean"] = NbrToStr(meaninfos[0]); -// infos["Dense_block_size_min"] = NbrToStr(mininfos[0]); -// infos["Low_rank_block_size_max"] = NbrToStr(maxinfos[1]); -// infos["Low_rank_block_size_mean"] = NbrToStr(meaninfos[1]); -// infos["Low_rank_block_size_min"] = NbrToStr(mininfos[1]); - -// infos["Rank_max"] = NbrToStr(maxinfos[2]); -// infos["Rank_mean"] = NbrToStr(meaninfos[2]); -// infos["Rank_min"] = NbrToStr(mininfos[2]); -// infos["Number_of_lrmat"] = NbrToStr(nlrmat); -// infos["Number_of_dmat"] = NbrToStr(ndmat); -// infos["Number_of_false_positive"] = NbrToStr(false_positive); -// infos["Local_compressed_size"] = NbrToStr(this->local_compressed_size()); -// infos["Compression_ratio"] = NbrToStr(this->compression_ratio()); -// infos["Space_saving"] = NbrToStr(this->space_saving()); -// infos["Local_size_max"] = NbrToStr(maxinfos[3]); -// infos["Local_size_mean"] = NbrToStr(meaninfos[3]); -// infos["Local_size_min"] = NbrToStr(mininfos[3]); - -// infos["Number_of_MPI_tasks"] = NbrToStr(sizeWorld); -// #if defined(_OPENMP) -// infos["Number_of_threads_per_tasks"] = NbrToStr(omp_get_max_threads()); -// infos["Number_of_procs"] = NbrToStr(sizeWorld * omp_get_max_threads()); -// #else -// infos["Number_of_procs"] = NbrToStr(sizeWorld); -// #endif - -// infos["Eta"] = NbrToStr(eta); -// infos["Eps"] = NbrToStr(epsilon); -// infos["MinTargetDepth"] = NbrToStr(mintargetdepth); -// infos["MinSourceDepth"] = NbrToStr(minsourcedepth); -// infos["MinClusterSizeTarget"] = NbrToStr(cluster_tree_t->get_minclustersize()); -// infos["MinClusterSizeSource"] = NbrToStr(cluster_tree_s->get_minclustersize()); -// infos["MinClusterDepthTarget"] = NbrToStr(cluster_tree_t->get_min_depth()); -// infos["MaxClusterDepthTarget"] = NbrToStr(cluster_tree_t->get_max_depth()); -// infos["MinClusterDepthSource"] = NbrToStr(cluster_tree_s->get_min_depth()); -// infos["MaxClusterDepthSource"] = NbrToStr(cluster_tree_s->get_max_depth()); -// infos["MaxBlockSize"] = NbrToStr(maxblocksize); -// } - -// template -// void HMatrix::add_vector_product(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) const { - -// int target_size = m_target_cluster_tree->get_size(); -// int source_size = m_source_cluster_tree->get_size(); -// int source_offset = m_source_cluster_tree->get_offset(); -// int target_offset = m_target_cluster_tree->get_offset(); - -// int incx(1), incy(1); -// CoefficientPrecision da(1); -// const auto &blocks = m_block_tree_properties->m_tasks; -// const auto &blocks_in_diagonal_block = m_block_tree_properties->m_leaves_in_diagonal_block; -// const auto &diagonal_dense_blocks = m_block_tree_properties->m_diagonal_dense_leaves; -// CoefficientPrecision local_beta = 0; - -// char symmetry_trans; -// if (m_block_tree_properties->m_symmetry == 'S') { -// if (trans == 'N') { -// symmetry_trans = 'T'; -// } else { -// symmetry_trans = 'N'; -// } -// } else if (m_block_tree_properties->m_symmetry == 'H') { -// if (trans == 'N') { -// symmetry_trans = 'C'; -// } else { -// symmetry_trans = 'N'; -// } -// } - -// std::transform(out, out + target_size, out, [beta](CoefficientPrecision a) { return a * beta; }); - -// // Contribution champ lointain -// #if defined(_OPENMP) -// # pragma omp parallel -// #endif -// { -// std::vector temp(target_size, 0); -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) nowait -// #endif -// for (int b = 0; b < blocks.size(); b++) { -// int offset_i = blocks[b]->get_target_cluster_tree().get_offset() - target_offset; -// int offset_j = blocks[b]->get_source_cluster_tree().get_offset() - source_offset; -// std::cout << offset_i << " " << offset_j << "\n"; -// if (m_block_tree_properties->m_symmetry == 'N' || offset_i != offset_j) { // remove strictly diagonal blocks -// blocks[b]->is_dense() ? blocks[b]->get_dense_data()->add_vector_product(trans, alpha, in + offset_j, local_beta, temp.data() + offset_i) : blocks[b]->get_low_rank_data()->add_vector_product(trans, alpha, in + offset_j, local_beta, temp.data() + offset_i); -// ; -// } -// } - -// // Symmetry part of the diagonal part -// if (m_block_tree_properties->m_symmetry != 'N') { -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) nowait -// #endif -// for (int b = 0; b < blocks_in_diagonal_block.size(); b++) { -// int offset_i = blocks_in_diagonal_block[b]->get_source_cluster_tree().get_offset(); -// int offset_j = blocks_in_diagonal_block[b]->get_target_cluster_tree().get_offset(); - -// if (offset_i != offset_j) { // remove strictly diagonal blocks -// blocks[b]->is_dense() ? blocks_in_diagonal_block[b]->get_dense_data()->add_vector_product(symmetry_trans, alpha, in + offset_j - target_offset, local_beta, temp.data() + (offset_i - source_offset)) : blocks_in_diagonal_block[b]->get_low_rank_data()->add_vector_product(symmetry_trans, alpha, in + offset_j - target_offset, local_beta, temp.data() + (offset_i - source_offset)); -// } -// } - -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) nowait -// #endif -// for (int b = 0; b < diagonal_dense_blocks.size(); b++) { -// int offset_i = diagonal_dense_blocks[b]->get_target_cluster_tree().get_offset(); -// int offset_j = diagonal_dense_blocks[b]->get_source_cluster_tree().get_offset(); - -// diagonal_dense_blocks[b]->get_dense_data()->add_vector_product_symmetric(trans, alpha, in + offset_j - source_offset, local_beta, temp.data() + (offset_i - target_offset), m_block_tree_properties->m_UPLO, m_block_tree_properties->m_symmetry); -// } -// } - -// #if defined(_OPENMP) -// # pragma omp critical -// #endif - -// Blas::axpy(&target_size, &da, temp.data(), &incx, out, &incy); -// } -// } - -template -void HMatrix::threaded_hierarchical_add_vector_product(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) const { - - set_leaves_in_cache(); - - int out_size(m_target_cluster->get_size()); - auto get_output_cluster{&HMatrix::get_target_cluster}; - auto get_input_cluster{&HMatrix::get_source_cluster}; - int local_input_offset = m_source_cluster->get_offset(); - int local_output_offset = m_target_cluster->get_offset(); - char trans_sym = (m_symmetry_type_for_leaves == 'S') ? 'T' : 'C'; - - if (trans != 'N') { - out_size = m_source_cluster->get_size(); - get_input_cluster = &HMatrix::get_target_cluster; - get_output_cluster = &HMatrix::get_source_cluster; - local_input_offset = m_target_cluster->get_offset(); - local_output_offset = m_source_cluster->get_offset(); - trans_sym = 'N'; - } - - int incx(1), incy(1); - if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { - // TODO: use blas - std::transform(out, out + out_size, out, [&beta](CoefficientPrecision &c) { return c * beta; }); - } - -// Contribution champ lointain -#if defined(_OPENMP) -# pragma omp parallel -#endif - { - std::vector temp(out_size, 0); -#if defined(_OPENMP) -# pragma omp for schedule(guided) nowait -#endif - for (int b = 0; b < m_leaves.size(); b++) { - int input_offset = (m_leaves[b]->*get_input_cluster)().get_offset(); - int output_offset = (m_leaves[b]->*get_output_cluster)().get_offset(); - m_leaves[b]->add_vector_product(trans, 1, in + input_offset - local_input_offset, 1, temp.data() + (output_offset - local_output_offset)); - } - - // Symmetry part of the diagonal part - if (m_symmetry_type_for_leaves != 'N') { -#if defined(_OPENMP) -# pragma omp for schedule(guided) nowait -#endif - for (int b = 0; b < m_leaves_for_symmetry.size(); b++) { - int input_offset = (m_leaves_for_symmetry[b]->*get_input_cluster)().get_offset(); - int output_offset = (m_leaves_for_symmetry[b]->*get_output_cluster)().get_offset(); - m_leaves_for_symmetry[b]->add_vector_product(trans_sym, 1, in + output_offset - local_input_offset, 1, temp.data() + (input_offset - local_output_offset)); - } - } - -#if defined(_OPENMP) -# pragma omp critical -#endif - Blas::axpy(&out_size, &alpha, temp.data(), &incx, out, &incy); - } -} - -template -void HMatrix::threaded_hierarchical_add_matrix_product_row_major(char trans, CoefficientPrecision alpha, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) const { - - set_leaves_in_cache(); - - if ((trans == 'T' && m_symmetry_type_for_leaves == 'H') - || (trans == 'C' && m_symmetry_type_for_leaves == 'S')) { - htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not supported (" + std::string(1, trans) + " with " + m_symmetry_type_for_leaves + ")"); // LCOV_EXCL_LINE - // throw std::invalid_argument("[Htool error] Operation is not supported (" + std::string(1, trans) + " with " + m_symmetry_type_for_leaves + ")"); // LCOV_EXCL_LINE - } - - int out_size(m_target_cluster->get_size() * mu); - auto get_output_cluster{&HMatrix::get_target_cluster}; - auto get_input_cluster{&HMatrix::get_source_cluster}; - int local_output_offset = m_target_cluster->get_offset(); - int local_input_offset = m_source_cluster->get_offset(); - char trans_sym = (m_symmetry_type_for_leaves == 'S') ? 'T' : 'C'; - - if (trans != 'N') { - out_size = m_source_cluster->get_size() * mu; - get_input_cluster = &HMatrix::get_target_cluster; - get_output_cluster = &HMatrix::get_source_cluster; - local_input_offset = m_target_cluster->get_offset(); - local_output_offset = m_source_cluster->get_offset(); - trans_sym = 'N'; - } - - int incx(1), incy(1); - if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { - // TODO: use blas - std::transform(out, out + out_size, out, [&beta](CoefficientPrecision &c) { return c * beta; }); - } - -// Contribution champ lointain -#if defined(_OPENMP) -# pragma omp parallel -#endif - { - std::vector temp(out_size, 0); -#if defined(_OPENMP) -# pragma omp for schedule(guided) nowait -#endif - for (int b = 0; b < m_leaves.size(); b++) { - int input_offset = (m_leaves[b]->*get_input_cluster)().get_offset(); - int output_offset = (m_leaves[b]->*get_output_cluster)().get_offset(); - m_leaves[b]->add_matrix_product_row_major(trans, 1, in + (input_offset - local_input_offset) * mu, 1, temp.data() + (output_offset - local_output_offset) * mu, mu); - } - - // Symmetry part of the diagonal part - if (m_symmetry_type_for_leaves != 'N') { -#if defined(_OPENMP) -# pragma omp for schedule(guided) nowait -#endif - for (int b = 0; b < m_leaves_for_symmetry.size(); b++) { - int input_offset = (m_leaves_for_symmetry[b]->*get_input_cluster)().get_offset(); - int output_offset = (m_leaves_for_symmetry[b]->*get_output_cluster)().get_offset(); - m_leaves_for_symmetry[b]->add_matrix_product_row_major(trans_sym, 1, in + (output_offset - local_input_offset) * mu, 1, temp.data() + (input_offset - local_output_offset) * mu, mu); - } - } - -#if defined(_OPENMP) -# pragma omp critical -#endif - Blas::axpy(&out_size, &alpha, temp.data(), &incx, out, &incy); - } -} - -// template -// void HMatrix::mymvprod_transp_local_to_global(const T *const in, T *const out, const int &mu) const { -// std::fill(out, out + this->nc * mu, 0); -// int incx(1), incy(1); -// int global_size_rhs = this->nc * mu; -// T da(1); - -// // Contribution champ lointain -// #if defined(_OPENMP) -// # pragma omp parallel -// #endif -// { -// std::vector temp(this->nc * mu, 0); -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) nowait -// #endif -// for (int b = 0; b < MyComputedBlocks.size(); b++) { -// int offset_i = MyComputedBlocks[b]->get_target_cluster()->get_offset(); -// int offset_j = MyComputedBlocks[b]->get_source_cluster()->get_offset(); -// if (!(m_symmetry != 'N') || offset_i != offset_j) { // remove strictly diagonal blocks -// MyComputedBlocks[b]->get_block_data()->add_mvprod_row_major(in + (offset_i - local_offset) * mu, temp.data() + offset_j * mu, mu, 'T', 'T'); -// } -// } - -// #if defined(_OPENMP) -// # pragma omp critical -// #endif -// Blas::axpy(&(global_size_rhs), &da, temp.data(), &incx, out, &incy); -// } - -// MPI_Allreduce(MPI_IN_PLACE, out, this->nc * mu, wrapper_mpi::mpi_type(), MPI_SUM, comm); -// } - -// template -// void HMatrix::local_to_global_target(const T *const in, T *const out, const int &mu) const { -// // Allgather -// std::vector recvcounts(sizeWorld); -// std::vector displs(sizeWorld); - -// displs[0] = 0; - -// for (int i = 0; i < sizeWorld; i++) { -// recvcounts[i] = (cluster_tree_t->get_masteroffset(i).second) * mu; -// if (i > 0) -// displs[i] = displs[i - 1] + recvcounts[i - 1]; -// } - -// MPI_Allgatherv(in, recvcounts[rankWorld], wrapper_mpi::mpi_type(), out, &(recvcounts[0]), &(displs[0]), wrapper_mpi::mpi_type(), comm); -// } - -// template -// void HMatrix::local_to_global_source(const T *const in, T *const out, const int &mu) const { -// // Allgather -// std::vector recvcounts(sizeWorld); -// std::vector displs(sizeWorld); - -// displs[0] = 0; - -// for (int i = 0; i < sizeWorld; i++) { -// recvcounts[i] = (cluster_tree_s->get_masteroffset(i).second) * mu; -// if (i > 0) -// displs[i] = displs[i - 1] + recvcounts[i - 1]; -// } -// MPI_Allgatherv(in, recvcounts[rankWorld], wrapper_mpi::mpi_type(), out, &(recvcounts[0]), &(displs[0]), wrapper_mpi::mpi_type(), comm); -// } - -// template -// void HMatrix::mymvprod_local_to_local(const T *const in, T *const out, const int &mu, T *work) const { -// double time = MPI_Wtime(); -// bool need_delete = false; -// if (work == nullptr) { -// work = new T[this->nc * mu]; -// need_delete = true; -// } -// this->local_to_global_source(in, work, mu); -// this->mymvprod_global_to_local(work, out, mu); - -// if (need_delete) { -// delete[] work; -// work = nullptr; -// } -// infos["nb_mat_vec_prod"] = NbrToStr(1 + StrToNbr(infos["nb_mat_vec_prod"])); -// infos["total_time_mat_vec_prod"] = NbrToStr(MPI_Wtime() - time + StrToNbr(infos["total_time_mat_vec_prod"])); -// } - -// template -// void HMatrix::mymvprod_transp_local_to_local(const T *const in, T *const out, const int &mu, T *work) const { -// int local_size_source = cluster_tree_s->get_masteroffset(rankWorld).second; - -// if (this->m_symmetry == 'S' || this->m_symmetry == 'H') { -// this->mymvprod_local_to_local(in, out, mu, work); -// return; -// } - -// double time = MPI_Wtime(); -// bool need_delete = false; -// if (work == nullptr) { -// work = new T[(this->nc + local_size_source * sizeWorld) * mu]; -// need_delete = true; -// } - -// std::fill(out, out + local_size_source * mu, 0); -// int incx(1), incy(1); -// int global_size_rhs = this->nc * mu; -// T da(1); - -// std::fill(work, work + this->nc * mu, 0); -// T *rbuf = work + this->nc * mu; - -// // Contribution champ lointain -// #if defined(_OPENMP) -// # pragma omp parallel -// #endif -// { -// std::vector temp(this->nc * mu, 0); -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) nowait -// #endif -// for (int b = 0; b < MyComputedBlocks.size(); b++) { -// int offset_i = MyComputedBlocks[b]->get_target_cluster()->get_offset(); -// int offset_j = MyComputedBlocks[b]->get_source_cluster()->get_offset(); -// if (!(m_symmetry != 'N') || offset_i != offset_j) { // remove strictly diagonal blocks -// MyComputedBlocks[b]->get_block_data()->add_mvprod_row_major(in + (offset_i - local_offset) * mu, temp.data() + offset_j * mu, mu, 'T', 'T'); -// } -// } - -// #if defined(_OPENMP) -// # pragma omp critical -// #endif -// Blas::axpy(&(global_size_rhs), &da, temp.data(), &incx, work, &incy); -// } - -// std::vector scounts(sizeWorld), rcounts(sizeWorld); -// std::vector sdispls(sizeWorld), rdispls(sizeWorld); - -// sdispls[0] = 0; -// rdispls[0] = 0; - -// for (int i = 0; i < sizeWorld; i++) { -// scounts[i] = (cluster_tree_s->get_masteroffset(i).second) * mu; -// rcounts[i] = (local_size_source)*mu; -// if (i > 0) { -// sdispls[i] = sdispls[i - 1] + scounts[i - 1]; -// rdispls[i] = rdispls[i - 1] + rcounts[i - 1]; -// } -// } - -// MPI_Alltoallv(work, &(scounts[0]), &(sdispls[0]), wrapper_mpi::mpi_type(), rbuf, &(rcounts[0]), &(rdispls[0]), wrapper_mpi::mpi_type(), comm); - -// for (int i = 0; i < sizeWorld; i++) -// std::transform(out, out + local_size_source * mu, rbuf + rdispls[i], out, std::plus()); - -// if (need_delete) { -// delete[] work; -// work = nullptr; -// } -// infos["nb_mat_vec_prod"] = NbrToStr(1 + StrToNbr(infos["nb_mat_vec_prod"])); -// infos["total_time_mat_vec_prod"] = NbrToStr(MPI_Wtime() - time + StrToNbr(infos["total_time_mat_vec_prod"])); -// } - -// template -// void HMatrix::mvprod_global_to_global(const T *const in, T *const out, const int &mu) const { -// double time = MPI_Wtime(); - -// if (mu == 1) { -// std::vector out_perm(local_size); -// std::vector buffer(std::max(nc, nr)); - -// // Permutation -// if (use_permutation) { -// this->source_to_cluster_permutation(in, buffer.data()); -// mymvprod_global_to_local(buffer.data(), out_perm.data(), 1); - -// } else { -// mymvprod_global_to_local(in, out_perm.data(), 1); -// } - -// // Allgather -// std::vector recvcounts(sizeWorld); -// std::vector displs(sizeWorld); - -// displs[0] = 0; - -// for (int i = 0; i < sizeWorld; i++) { -// recvcounts[i] = cluster_tree_t->get_masteroffset(i).second * mu; -// if (i > 0) -// displs[i] = displs[i - 1] + recvcounts[i - 1]; -// } - -// if (use_permutation) { -// MPI_Allgatherv(out_perm.data(), recvcounts[rankWorld], wrapper_mpi::mpi_type(), buffer.data(), &(recvcounts[0]), &(displs[0]), wrapper_mpi::mpi_type(), comm); - -// // Permutation -// this->cluster_to_target_permutation(buffer.data(), out); -// } else { -// MPI_Allgatherv(out_perm.data(), recvcounts[rankWorld], wrapper_mpi::mpi_type(), out, &(recvcounts[0]), &(displs[0]), wrapper_mpi::mpi_type(), comm); -// } - -// } else { - -// std::vector in_perm(std::max(nr, nc) * mu * 2); -// std::vector out_perm(local_size * mu); -// std::vector buffer(nc); - -// for (int i = 0; i < mu; i++) { -// // Permutation -// if (use_permutation) { -// this->source_to_cluster_permutation(in + i * nc, buffer.data()); -// // Transpose -// for (int j = 0; j < nc; j++) { -// in_perm[i + j * mu] = buffer[j]; -// } -// } else { -// // Transpose -// for (int j = 0; j < nc; j++) { -// in_perm[i + j * mu] = in[j + i * nc]; -// } -// } -// } - -// if (m_symmetry == 'H') { -// conj_if_complex(in_perm.data(), nc * mu); -// } - -// mymvprod_global_to_local(in_perm.data(), in_perm.data() + nc * mu, mu); - -// // Tranpose -// for (int i = 0; i < mu; i++) { -// for (int j = 0; j < local_size; j++) { -// out_perm[i * local_size + j] = in_perm[i + j * mu + nc * mu]; -// } -// } - -// if (m_symmetry == 'H') { -// conj_if_complex(out_perm.data(), out_perm.size()); -// } - -// // Allgather -// std::vector recvcounts(sizeWorld); -// std::vector displs(sizeWorld); - -// displs[0] = 0; - -// for (int i = 0; i < sizeWorld; i++) { -// recvcounts[i] = cluster_tree_t->get_masteroffset(i).second * mu; -// if (i > 0) -// displs[i] = displs[i - 1] + recvcounts[i - 1]; -// } - -// MPI_Allgatherv(out_perm.data(), recvcounts[rankWorld], wrapper_mpi::mpi_type(), in_perm.data() + mu * nr, &(recvcounts[0]), &(displs[0]), wrapper_mpi::mpi_type(), comm); - -// for (int i = 0; i < mu; i++) { -// if (use_permutation) { -// for (int j = 0; j < sizeWorld; j++) { -// std::copy_n(in_perm.data() + mu * nr + displs[j] + i * recvcounts[j] / mu, recvcounts[j] / mu, in_perm.data() + i * nr + displs[j] / mu); -// } - -// // Permutation -// this->cluster_to_target_permutation(in_perm.data() + i * nr, out + i * nr); -// } else { -// for (int j = 0; j < sizeWorld; j++) { -// std::copy_n(in_perm.data() + mu * nr + displs[j] + i * recvcounts[j] / mu, recvcounts[j] / mu, out + i * nr + displs[j] / mu); -// } -// } -// } -// } -// // Timing -// infos["nb_mat_vec_prod"] = NbrToStr(1 + StrToNbr(infos["nb_mat_vec_prod"])); -// infos["total_time_mat_vec_prod"] = NbrToStr(MPI_Wtime() - time + StrToNbr(infos["total_time_mat_vec_prod"])); -// } - -// template -// void HMatrix::mvprod_local_to_local(const T *const in, T *const out, const int &mu, T *work) const { -// double time = MPI_Wtime(); -// bool need_delete = false; -// if (work == nullptr) { -// work = new T[this->nc * mu]; -// need_delete = true; -// } - -// int local_size_source = cluster_tree_s->get_masteroffset(rankWorld).second; - -// if (!(cluster_tree_s->IsLocal()) || !(cluster_tree_t->IsLocal())) { -// throw std::logic_error("[Htool error] Permutation is not local, mvprod_local_to_local cannot be used"); // LCOV_EXCL_LINE -// } -// if (mu == 1) { -// std::vector in_perm(local_size_source), out_perm(local_size); - -// // local permutation -// if (use_permutation) { -// // permutation -// this->local_source_to_local_cluster(in, in_perm.data()); - -// // prod -// mymvprod_local_to_local(in_perm.data(), out_perm.data(), 1, work); - -// // permutation -// this->local_cluster_to_local_target(out_perm.data(), out, comm); - -// } else { -// mymvprod_local_to_local(in, out, 1, work); -// } - -// } else { - -// std::vector in_perm(local_size_source * mu); -// std::vector out_perm(local_size * mu); -// std::vector buffer(std::max(local_size_source, local_size)); - -// for (int i = 0; i < mu; i++) { -// // local permutation -// if (use_permutation) { -// this->local_source_to_local_cluster(in + i * local_size_source, buffer.data()); - -// // Transpose -// for (int j = 0; j < local_size_source; j++) { -// in_perm[i + j * mu] = buffer[j]; -// } -// } else { -// // Transpose -// for (int j = 0; j < local_size_source; j++) { -// in_perm[i + j * mu] = in[j + i * local_size_source]; -// } -// } -// } - -// if (m_symmetry == 'H') { -// conj_if_complex(in_perm.data(), local_size_source * mu); -// } - -// mymvprod_local_to_local(in_perm.data(), out_perm.data(), mu, work); - -// for (int i = 0; i < mu; i++) { -// if (use_permutation) { -// // Tranpose -// for (int j = 0; j < local_size; j++) { -// buffer[j] = out_perm[i + j * mu]; -// } - -// // local permutation -// this->local_cluster_to_local_target(buffer.data(), out + i * local_size); -// } else { -// // Tranpose -// for (int j = 0; j < local_size; j++) { -// out[j + i * local_size] = out_perm[i + j * mu]; -// } -// } -// } - -// if (m_symmetry == 'H') { -// conj_if_complex(out, out_perm.size()); -// } -// } - -// if (need_delete) { -// delete[] work; -// work = nullptr; -// } -// // Timing -// infos["nb_mat_vec_prod"] = NbrToStr(1 + StrToNbr(infos["nb_mat_vec_prod"])); -// infos["total_time_mat_vec_prod"] = NbrToStr(MPI_Wtime() - time + StrToNbr(infos["total_time_mat_vec_prod"])); -// } - -// template -// void HMatrix::mvprod_transp_global_to_global(const T *const in, T *const out, const int &mu) const { -// double time = MPI_Wtime(); -// if (this->m_symmetry == 'S') { -// this->mvprod_global_to_global(in, out, mu); -// return; -// } else if (this->m_symmetry == 'H') { -// std::vector in_conj(in, in + nr * mu); -// conj_if_complex(in_conj.data(), nr * mu); -// this->mvprod_global_to_global(in_conj.data(), out, mu); -// conj_if_complex(out, mu * nc); -// return; -// } -// if (mu == 1) { - -// if (use_permutation) { -// std::vector in_perm(nr), out_perm(nc); - -// // permutation -// this->target_to_cluster_permutation(in, in_perm.data()); - -// mymvprod_transp_local_to_global(in_perm.data() + local_offset, out_perm.data(), 1); - -// // permutation -// this->cluster_to_source_permutation(out_perm.data(), out); -// } else { -// mymvprod_transp_local_to_global(in + local_offset, out, 1); -// } - -// } else { - -// std::vector out_perm(mu * nc); -// std::vector in_perm(local_size * mu + mu * nc); -// std::vector buffer(nr); - -// for (int i = 0; i < mu; i++) { -// // Permutation -// if (use_permutation) { -// this->target_to_cluster_permutation(in + i * nr, buffer.data()); -// // Transpose -// for (int j = local_offset; j < local_offset + local_size; j++) { -// in_perm[i + (j - local_offset) * mu] = buffer[j]; -// } -// } else { -// // Transpose -// for (int j = local_offset; j < local_offset + local_size; j++) { -// in_perm[i + (j - local_offset) * mu] = in[j + i * nr]; -// } -// } -// } - -// // It should never happen since we use mvprod_global_to_global in this case -// // if (m_symmetry == 'H') { -// // conj_if_complex(in_perm.data(), local_size * mu); -// // } - -// mymvprod_transp_local_to_global(in_perm.data(), in_perm.data() + local_size * mu, mu); - -// for (int i = 0; i < mu; i++) { -// if (use_permutation) { -// // Transpose -// for (int j = 0; j < nc; j++) { -// out_perm[i * nc + j] = in_perm[i + j * mu + local_size * mu]; -// } -// cluster_to_source_permutation(out_perm.data() + i * nc, out + i * nc); -// } else { -// for (int j = 0; j < nc; j++) { -// out[i * nc + j] = in_perm[i + j * mu + local_size * mu]; -// } -// } -// } - -// // It should never happen since we use mvprod_global_to_global in this case -// // if (m_symmetry == 'H') { -// // conj_if_complex(out, nc * mu); -// // } -// } -// // Timing -// infos["nb_mat_vec_prod"] = NbrToStr(1 + StrToNbr(infos["nb_mat_vec_prod"])); -// infos["total_time_mat_vec_prod"] = NbrToStr(MPI_Wtime() - time + StrToNbr(infos["total_time_mat_vec_prod"])); -// } - -// template -// void HMatrix::mvprod_transp_local_to_local(const T *const in, T *const out, const int &mu, T *work) const { -// double time = MPI_Wtime(); -// int local_size_source = cluster_tree_s->get_masteroffset(rankWorld).second; -// if (this->m_symmetry == 'S') { -// this->mvprod_local_to_local(in, out, mu); -// return; -// } else if (this->m_symmetry == 'H') { -// std::vector in_conj(in, in + local_size * mu); -// conj_if_complex(in_conj.data(), local_size * mu); -// this->mvprod_local_to_local(in_conj.data(), out, mu); -// conj_if_complex(out, mu * local_size_source); -// return; -// } -// bool need_delete = false; -// if (work == nullptr) { -// work = new T[(this->nc + sizeWorld * this->get_source_cluster()->get_local_size()) * mu]; -// need_delete = true; -// } - -// if (!(cluster_tree_s->IsLocal()) || !(cluster_tree_t->IsLocal())) { -// throw std::logic_error("[Htool error] Permutation is not local, mvprod_local_to_local cannot be used"); // LCOV_EXCL_LINE -// } - -// if (mu == 1) { -// std::vector in_perm(local_size), out_perm(local_size_source); - -// // local permutation -// if (use_permutation) { -// this->local_target_to_local_cluster(in, in_perm.data()); - -// // prod -// mymvprod_transp_local_to_local(in_perm.data(), out_perm.data(), 1, work); - -// // permutation -// this->local_cluster_to_local_source(out_perm.data(), out, comm); - -// } else { -// mymvprod_transp_local_to_local(in, out, 1, work); -// } - -// } else { - -// std::vector in_perm(local_size * mu); -// std::vector out_perm(local_size_source * mu); -// std::vector buffer(std::max(local_size_source, local_size)); - -// for (int i = 0; i < mu; i++) { -// // local permutation -// if (use_permutation) { -// this->local_target_to_local_cluster(in + i * local_size, buffer.data()); - -// // Transpose -// for (int j = 0; j < local_size; j++) { -// in_perm[i + j * mu] = buffer[j]; -// } -// } else { -// // Transpose -// for (int j = 0; j < local_size; j++) { -// in_perm[i + j * mu] = in[j + i * local_size]; -// } -// } -// } -// // It should never happen since we use mvprod_global_to_global in this case -// // if (m_symmetry == 'H') { -// // conj_if_complex(in_perm.data(), local_size_source * mu); -// // } - -// mymvprod_transp_local_to_local(in_perm.data(), out_perm.data(), mu, work); - -// for (int i = 0; i < mu; i++) { -// if (use_permutation) { -// // Tranpose -// for (int j = 0; j < local_size_source; j++) { -// buffer[j] = out_perm[i + j * mu]; -// } - -// // local permutation -// this->local_cluster_to_local_source(buffer.data(), out + i * local_size_source); -// } else { -// // Tranpose -// for (int j = 0; j < local_size_source; j++) { -// out[j + i * local_size_source] = out_perm[i + j * mu]; -// } -// } -// } -// // It should never happen since we use mvprod_global_to_global in this case -// // if (m_symmetry == 'H') { -// // conj_if_complex(out, out_perm.size()); -// // } -// } - -// if (need_delete) { -// delete[] work; -// work = nullptr; -// } -// // Timing -// infos["nb_mat_vec_prod"] = NbrToStr(1 + StrToNbr(infos["nb_mat_vec_prod"])); -// infos["total_time_mat_vec_prod"] = NbrToStr(MPI_Wtime() - time + StrToNbr(infos["total_time_mat_vec_prod"])); -// } - -// template -// void HMatrix::mvprod_subrhs(const T *const in, T *const out, const int &mu, const int &offset, const int &size, const int &margin) const { -// std::fill(out, out + local_size * mu, 0); - -// // Contribution champ lointain -// #if defined(_OPENMP) -// # pragma omp parallel -// #endif -// { -// std::vector temp(local_size * mu, 0); -// // To localize the rhs with multiple rhs, it is transpose. So instead of A*B, we do transpose(B)*transpose(A) -// char transb = 'T'; -// // In case of a hermitian matrix, the rhs is conjugate transpose -// if (m_symmetry == 'H') { -// transb = 'C'; -// } -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) -// #endif -// for (int b = 0; b < MyComputedBlocks.size(); b++) { -// int offset_i = MyComputedBlocks[b]->get_target_cluster()->get_offset(); -// int offset_j = MyComputedBlocks[b]->get_source_cluster()->get_offset(); -// int size_j = MyComputedBlocks[b]->get_source_cluster()->get_size(); - -// if ((offset_j < offset + size && offset < offset_j + size_j) && (m_symmetry == 'N' || offset_i != offset_j)) { -// if (offset_j - offset < 0) { -// std::cout << "TEST " -// << " " << offset_j << " " << size_j << " " -// << " " << offset << " " << size << " " -// << " " << rankWorld << " " -// << offset_j - offset << std::endl; -// } -// MyComputedBlocks[b]->get_block_data()->add_mvprod_row_major(in + (offset_j - offset + margin) * mu, temp.data() + (offset_i - local_offset) * mu, mu, transb); -// } -// } - -// // Symmetry part of the diagonal part -// if (m_symmetry != 'N') { -// transb = 'N'; -// char op_sym = 'T'; -// if (m_symmetry == 'H') { -// op_sym = 'C'; -// } -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) -// #endif -// for (int b = 0; b < MyDiagComputedBlocks.size(); b++) { -// int offset_i = MyDiagComputedBlocks[b]->get_source_cluster()->get_offset(); -// int offset_j = MyDiagComputedBlocks[b]->get_target_cluster()->get_offset(); -// int size_j = MyDiagComputedBlocks[b]->get_target_cluster()->get_size(); - -// if ((offset_j < offset + size && offset < offset_j + size_j) && offset_i != offset_j) { // remove strictly diagonal blocks -// MyDiagComputedBlocks[b]->get_block_data()->add_mvprod_row_major(in + (offset_j - offset + margin) * mu, temp.data() + (offset_i - local_offset) * mu, mu, transb, op_sym); -// } -// } - -// #if defined(_OPENMP) -// # pragma omp for schedule(guided) -// #endif -// for (int b = 0; b < MyStrictlyDiagNearFieldMats.size(); b++) { -// const Block *M = MyStrictlyDiagNearFieldMats[b]; -// int offset_i = M->get_source_cluster()->get_offset(); -// int offset_j = M->get_target_cluster()->get_offset(); -// int size_j = M->get_source_cluster()->get_size(); -// if (offset_j < offset + size && offset < offset_j + size_j) { -// M->get_dense_block_data()->add_mvprod_row_major_sym(in + (offset_j - offset + margin) * mu, temp.data() + (offset_i - local_offset) * mu, mu, this->UPLO, this->symmetry); -// } -// } -// } -// #if defined(_OPENMP) -// # pragma omp critical -// #endif -// std::transform(temp.begin(), temp.end(), out, out, std::plus()); -// } - -// if (!(this->cluster_tree_s->get_local_offset() < offset + size && offset < this->cluster_tree_s->get_local_offset() + this->cluster_tree_s->get_local_size()) && this->OffDiagonalApproximation != nullptr) { -// std::vector off_diagonal_out(cluster_tree_t->get_local_size() * mu, 0); -// int off_diagonal_offset = (offset < this->cluster_tree_s->get_local_offset()) ? offset : offset - this->cluster_tree_s->get_local_size(); -// if (mu > 1 && !this->OffDiagonalApproximation->IsUsingRowMajorStorage()) { // Need to transpose input and output for OffDiagonalApproximation -// std::vector off_diagonal_input_column_major(size * mu, 0); - -// for (int i = 0; i < mu; i++) { -// for (int j = 0; j < size; j++) { -// off_diagonal_input_column_major[j + i * size] = in[i + j * mu]; -// } -// } - -// if (m_symmetry == 'H') { -// conj_if_complex(off_diagonal_input_column_major.data(), size * mu); -// } - -// this->OffDiagonalApproximation->mvprod_subrhs_to_local(off_diagonal_input_column_major.data(), off_diagonal_out.data(), mu, off_diagonal_offset, size); - -// if (m_symmetry == 'H') { -// conj_if_complex(off_diagonal_out.data(), off_diagonal_out.size()); -// } -// for (int i = 0; i < mu; i++) { -// for (int j = 0; j < local_size; j++) { -// out[i + j * mu] += off_diagonal_out[i * local_size + j]; -// } -// } -// } else { -// this->OffDiagonalApproximation->mvprod_subrhs_to_local(in, off_diagonal_out.data(), mu, off_diagonal_offset, size); - -// int incx(1), incy(1), local_size_rhs(local_size * mu); -// T da(1); - -// Blas::axpy(&local_size_rhs, &da, off_diagonal_out.data(), &incx, out, &incy); -// } -// } -// } - -// template -// void HMatrix::source_to_cluster_permutation(const T *const in, T *const out) const { -// global_to_cluster(cluster_tree_m_source_cluster_tree->get(), in, out); -// } - -// template -// void HMatrix::target_to_cluster_permutation(const T *const in, T *const out) const { -// global_to_cluster(cluster_tree_m_target_cluster_tree->get(), in, out); -// } - -// template -// void HMatrix::cluster_to_target_permutation(const T *const in, T *const out) const { -// cluster_to_global(cluster_tree_m_target_cluster_tree->get(), in, out); -// } - -// template -// void HMatrix::cluster_to_source_permutation(const T *const in, T *const out) const { -// cluster_to_global(cluster_tree_m_source_cluster_tree->get(), in, out); -// } - -// template -// void HMatrix::local_target_to_local_cluster(const T *const in, T *const out, MPI_Comm comm) const { -// local_to_local_cluster(cluster_tree_m_target_cluster_tree->get(), in, out, comm); -// } - -// template -// void HMatrix::local_source_to_local_cluster(const T *const in, T *const out, MPI_Comm comm) const { -// local_to_local_cluster(cluster_tree_m_source_cluster_tree->get(), in, out, comm); -// } - -// template -// void HMatrix::local_cluster_to_local_target(const T *const in, T *const out, MPI_Comm comm) const { -// local_cluster_to_local(cluster_tree_m_target_cluster_tree->get(), in, out, comm); -// } - -// template -// void HMatrix::local_cluster_to_local_source(const T *const in, T *const out, MPI_Comm comm) const { -// local_cluster_to_local(cluster_tree_m_source_cluster_tree->get(), in, out, comm); -// } - -// template -// std::vector HMatrix::operator*(const std::vector &x) const { -// assert(x.size() == nc); -// std::vector result(nr, 0); -// mvprod_global_to_global(x.data(), result.data(), 1); -// return result; -// } - -// template -// underlying_type HMatrix::compressed_size() const { - -// double my_compressed_size = 0.; -// double nr_b, nc_b, rank; -// const auto &low_rank_leaves = m_block_tree_properties->m_low_rank_leaves; -// const auto &dense_leaves = m_block_tree_properties->m_dense_leaves; -// std::cout << dense_leaves.size() << " " << low_rank_leaves.size() << "\n"; -// for (int j = 0; j < low_rank_leaves.size(); j++) { -// nr_b = low_rank_leaves[j]->get_target_cluster_tree().get_size(); -// nc_b = low_rank_leaves[j]->get_source_cluster_tree().get_size(); -// rank = low_rank_leaves[j]->get_low_rank_data()->rank_of(); -// my_compressed_size += rank * (nr_b + nc_b); -// } - -// for (int j = 0; j < dense_leaves.size(); j++) { -// nr_b = dense_leaves[j]->get_target_cluster_tree().get_size(); -// nc_b = dense_leaves[j]->get_source_cluster_tree().get_size(); -// if (dense_leaves[j]->get_target_cluster_tree().get_offset() == dense_leaves[j]->get_source_cluster_tree().get_offset() && m_block_tree_properties->m_symmetry != 'N' && nr_b == nc_b) { -// my_compressed_size += (nr_b * (nc_b + 1)) / 2; -// } else { -// my_compressed_size += nr_b * nc_b; -// } -// } - -// return my_compressed_size; -// } - -// template -// void HMatrix::print_infos() const { -// int rankWorld; -// MPI_Comm_rank(comm, &rankWorld); - -// if (rankWorld == 0) { -// for (std::map::const_iterator it = infos.begin(); it != infos.end(); ++it) { -// std::cout << it->first << "\t" << it->second << std::endl; -// } -// std::cout << std::endl; -// } -// } - -// template -// void HMatrix::save_infos(const std::string &outputname, std::ios_base::openmode mode, const std::string &sep) const { -// int rankWorld; -// MPI_Comm_rank(comm, &rankWorld); - -// if (rankWorld == 0) { -// std::ofstream outputfile(outputname, mode); -// if (outputfile) { -// for (std::map::const_iterator it = infos.begin(); it != infos.end(); ++it) { -// outputfile << it->first << sep << it->second << std::endl; -// } -// outputfile.close(); -// } else { -// std::cout << "Unable to create " << outputname << std::endl; // LCOV_EXCL_LINE -// } -// } -// } - -// template -// void HMatrix::save_plot(const std::string &outputname) const { - -// std::ofstream outputfile((outputname + ".csv").c_str()); - -// if (outputfile) { -// const auto &output = get_output(); -// outputfile << m_block_tree_properties->m_root_cluster_tree_target->get_size() << "," << m_block_tree_properties->m_root_cluster_tree_source->get_size() << std::endl; -// for (const auto &block : output) { -// outputfile << block << "\n"; -// } -// outputfile.close(); -// } else { -// std::cout << "Unable to create " << outputname << std::endl; // LCOV_EXCL_LINE -// } -// } - -// template -// std::vector HMatrix::get_output() const { - -// std::vector output(m_block_tree_properties->m_tasks.size()); -// int index = 0; -// for (const auto &task : m_block_tree_properties->m_tasks) { -// output[index].target_offset = taskm_target_clusterree().get_offset() - m_block_tree_properties->m_root_cluster_tree_target->get_offset(); -// output[index].target_size = taskm_target_clusterree().get_size(); -// output[index].source_offset = task->get_m_t.().get_offset() - m_block_tree_properties->m_root_cluster_tree_source->get_offset(); -// output[index].source_size = task->get_m_t.().get_size(); -// output[index].rank = task->is_dense() ? -1 : task->m_low_rank_data->rank_of(); -// index++; -// } - -// return output; -// } - -// template -// underlying_type Frobenius_absolute_error(const HMatrix &B, const VirtualGenerator &A) { -// underlying_type myerr = 0; -// for (int j = 0; j < B.MyFarFieldMats.size(); j++) { -// underlying_type test = Frobenius_absolute_error(*(B.MyFarFieldMats[j]), *(B.MyFarFieldMats[j]->get_low_rank_block_data()), A); -// myerr += std::pow(test, 2); -// } - -// underlying_type err = 0; -// MPI_Allreduce(&myerr, &err, 1, wrapper_mpi::mpi_underlying_type(), MPI_SUM, B.comm); - -// return std::sqrt(err); -// } - -// template -// Matrix HMatrix::get_local_dense() const { -// Matrix Dense(local_size, nc); -// // Internal dense blocks -// for (int l = 0; l < MyNearFieldMats.size(); l++) { -// const Block *submat = MyNearFieldMats[l]; -// int local_nr = submat->get_target_cluster()->get_size(); -// int local_nc = submat->get_source_cluster()->get_size(); -// int offset_i = submat->get_target_cluster()->get_offset(); -// int offset_j = submat->get_source_cluster()->get_offset(); -// for (int k = 0; k < local_nc; k++) { -// std::copy_n(&(submat->get_dense_block_data()->operator()(0, k)), local_nr, Dense.data() + (offset_i - local_offset) + (offset_j + k) * local_size); -// } -// } - -// // Internal compressed block -// for (int l = 0; l < MyFarFieldMats.size(); l++) { -// const Block *lmat = MyFarFieldMats[l]; -// int local_nr = lmat->get_target_cluster()->get_size(); -// int local_nc = lmat->get_source_cluster()->get_size(); -// int offset_i = lmat->get_target_cluster()->get_offset(); -// int offset_j = lmat->get_source_cluster()->get_offset(); -// Matrix FarFielBlock(local_nr, local_nc); -// lmat->get_low_rank_block_data()->get_whole_matrix(&(FarFielBlock(0, 0))); -// for (int k = 0; k < local_nc; k++) { -// std::copy_n(&(FarFielBlock(0, k)), local_nr, Dense.data() + (offset_i - local_offset) + (offset_j + k) * local_size); -// } -// } -// return Dense; -// } - -// template -// Matrix HMatrix::get_local_dense_perm() const { -// Matrix Dense(local_size, nc); -// copy_local_dense_perm(Dense.data()); -// return Dense; -// } - template void copy_to_dense(const HMatrix &hmatrix, CoefficientPrecision *ptr) { @@ -2138,153 +443,5 @@ void copy_diagonal_in_user_numbering(const HMatrix -// Matrix HMatrix::get_local_interaction(bool permutation) const { -// int local_size_source = cluster_tree_s->get_masteroffset(rankWorld).second; -// Matrix local_interaction(local_size, local_size_source); -// copy_local_interaction(local_interaction.data(), permutation); -// return local_interaction; -// } - -// template -// Matrix HMatrix::get_local_diagonal_block(bool permutation) const { -// int local_size_source = cluster_tree_s->get_masteroffset(rankWorld).second; -// Matrix diagonal_block(local_size, local_size_source); -// copy_local_diagonal_block(diagonal_block.data(), permutation); -// return diagonal_block; -// } - -// template -// void HMatrix::copy_local_interaction(T *ptr, bool permutation) const { -// if ((!(cluster_tree_t->IsLocal()) || !(cluster_tree_s->IsLocal())) && permutation) { -// throw std::logic_error("[Htool error] Permutation is not local, get_local_interaction cannot be used"); // LCOV_EXCL_LINE -// } - -// int local_offset_source = cluster_tree_s->get_masteroffset(rankWorld).first; -// int local_size_source = cluster_tree_s->get_masteroffset(rankWorld).second; -// // Internal dense blocks -// for (int i = 0; i < MyDiagNearFieldMats.size(); i++) { -// const Block *submat = MyDiagNearFieldMats[i]; -// int local_nr = submat->get_target_cluster()->get_size(); -// int local_nc = submat->get_source_cluster()->get_size(); -// int offset_i = submat->get_target_cluster()->get_offset() - local_offset; -// int offset_j = submat->get_source_cluster()->get_offset() - local_offset_source; -// for (int i = 0; i < local_nc; i++) { -// std::copy_n(&(submat->get_dense_block_data()->operator()(0, i)), local_nr, ptr + offset_i + (offset_j + i) * local_size); -// } -// } - -// // Internal compressed block -// for (int i = 0; i < MyDiagFarFieldMats.size(); i++) { -// const Block *lmat = MyDiagFarFieldMats[i]; -// int local_nr = lmat->get_target_cluster()->get_size(); -// int local_nc = lmat->get_source_cluster()->get_size(); -// int offset_i = lmat->get_target_cluster()->get_offset() - local_offset; -// int offset_j = lmat->get_source_cluster()->get_offset() - local_offset_source; -// ; -// Matrix FarFielBlock(local_nr, local_nc); -// lmat->get_low_rank_block_data()->get_whole_matrix(&(FarFielBlock(0, 0))); -// for (int i = 0; i < local_nc; i++) { -// std::copy_n(&(FarFielBlock(0, i)), local_nr, ptr + offset_i + (offset_j + i) * local_size); -// } -// } - -// // Asking for permutation while symmetry!=N means that the block is upper/lower triangular in Htool's numbering, but it is not true in User's numbering - -// if (permutation && m_symmetry != 'N') { -// if (UPLO == 'L' && m_symmetry == 'S') { -// for (int i = 0; i < local_size; i++) { -// for (int j = 0; j < i; j++) { -// ptr[j + i * local_size] = ptr[i + j * local_size]; -// } -// } -// } - -// if (UPLO == 'U' && m_symmetry == 'S') { -// for (int i = 0; i < local_size; i++) { -// for (int j = i + 1; j < local_size_source; j++) { -// ptr[j + i * local_size] = ptr[i + j * local_size]; -// } -// } -// } -// if (UPLO == 'L' && m_symmetry == 'H') { -// for (int i = 0; i < local_size; i++) { -// for (int j = 0; j < i; j++) { -// ptr[j + i * local_size] = conj_if_complex(ptr[i + j * local_size]); -// } -// } -// } - -// if (UPLO == 'U' && m_symmetry == 'H') { -// for (int i = 0; i < local_size; i++) { -// for (int j = i + 1; j < local_size_source; j++) { -// ptr[j + i * local_size] = conj_if_complex(ptr[i + j * local_size]); -// } -// } -// } -// } -// // Permutations -// if (permutation) { -// Matrix diagonal_block_perm(local_size, local_size_source); -// for (int i = 0; i < local_size; i++) { -// for (int j = 0; j < local_size_source; j++) { -// diagonal_block_perm(i, cluster_tree_s->get_global_perm(j + local_offset_source) - local_offset_source) = ptr[i + j * local_size]; -// } -// } - -// for (int i = 0; i < local_size; i++) { -// this->local_cluster_to_local_target(diagonal_block_perm.data() + i * local_size, ptr + i * local_size, comm); -// } -// } -// } - -// template -// void HMatrix::copy_local_diagonal_block(T *ptr, bool permutation) const { -// if (cluster_tree_t != cluster_tree_s) { -// throw std::logic_error("[Htool error] Matrix is not square a priori, get_local_diagonal_block cannot be used"); // LCOV_EXCL_LINE -// } -// copy_local_interaction(ptr, permutation); -// } - -// template -// std::pair HMatrix::get_max_size_blocks() const { -// int local_max_size_j = 0; -// int local_max_size_i = 0; - -// for (int i = 0; i < MyFarFieldMats.size(); i++) { -// if (local_max_size_j < MyFarFieldMats[i]->get_source_cluster()->get_size()) -// local_max_size_j = MyFarFieldMats[i]->get_source_cluster()->get_size(); -// if (local_max_size_i < MyFarFieldMats[i]->get_target_cluster()->get_size()) -// local_max_size_i = MyFarFieldMats[i]->get_target_cluster()->get_size(); -// } -// for (int i = 0; i < MyNearFieldMats.size(); i++) { -// if (local_max_size_j < MyNearFieldMats[i]->get_source_cluster()->get_size()) -// local_max_size_j = MyNearFieldMats[i]->get_source_cluster()->get_size(); -// if (local_max_size_i < MyNearFieldMats[i]->get_target_cluster()->get_size()) -// local_max_size_i = MyNearFieldMats[i]->get_target_cluster()->get_size(); -// } - -// return std::pair(local_max_size_i, local_max_size_j); -// } - -// template -// void HMatrix::apply_dirichlet(const std::vector &boundary) { -// // Renum -// std::vector boundary_renum(boundary.size()); -// this->source_to_cluster_permutation(boundary.data(), boundary_renum.data()); - -// // -// for (int j = 0; j < MyStrictlyDiagNearFieldMats.size(); j++) { -// SubMatrix &submat = *(MyStrictlyDiagNearFieldMats[j]); -// int local_nr = submat.nb_rows(); -// int local_nc = submat.nb_cols(); -// int offset_i = submam_target_cluster_tree->get_offset_i(); -// for (int i = offset_i; i < offset_i + std::min(local_nr, local_nc); i++) { -// if (boundary_renum[i]) -// submat(i - offset_i, i - offset_i) = 1e30; -// } -// } -// } - } // namespace htool #endif diff --git a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp index 1d992a1a..4efcb6b7 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp @@ -17,7 +17,14 @@ void add_hmatrix_matrix_product(char transa, char transb, CoefficientPrecision a } else { Matrix transposed_C(C.nb_cols(), C.nb_rows()); transpose(C, transposed_C); - sequential_add_hmatrix_matrix_product_row_major(transa, 'N', alpha, A, B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); + std::vector buffer_B(transb == 'C' ? B.nb_cols() * B.nb_rows() : 0); + + if (transb == 'C') { + std::copy(B.data(), B.data() + B.nb_cols() * B.nb_rows(), buffer_B.data()); + conj_if_complex(buffer_B.data(), buffer_B.size()); + } + + sequential_add_hmatrix_matrix_product_row_major(transa, 'N', alpha, A, transb == 'C' ? buffer_B.data() : B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); transpose(transposed_C, C); } } diff --git a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp index f9d283b1..2b864a03 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp @@ -2,9 +2,50 @@ #define HTOOL_HMATRIX_LINALG_ADD_HMATRIX_MATRIX_PRODUCT_ROW_MAJOR_HPP #include "../hmatrix.hpp" +#include "../lrmat/linalg/add_lrmat_matrix_product_row_major.hpp" namespace htool { +template +void add_hmatrix_matrix_product_row_major(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) { + switch (A.get_storage_type()) { + case HMatrix::StorageType::Dense: + if (A.get_symmetry() == 'N') { + add_matrix_matrix_product_row_major(transa, transb, alpha, *A.get_dense_data(), in, beta, out, mu); + } else if (A.get_symmetry() == 'S') { + add_symmetric_matrix_matrix_product_row_major('L', A.get_UPLO(), alpha, *A.get_dense_data(), in, beta, out, mu); + } + break; + case HMatrix::StorageType::LowRank: + add_lrmat_matrix_product_row_major(transa, transb, alpha, *A.get_low_rank_data(), in, beta, out, mu); + break; + default: + sequential_add_hmatrix_matrix_product_row_major(transa, transb, alpha, A, in, beta, out, mu); + break; + } +} + +template +void add_hmatrix_matrix_product_row_major(char transa, char transb, std::complex alpha, const HMatrix, CoordinatePrecision> &A, const std::complex *in, std::complex beta, std::complex *out, int mu) { + switch (A.get_storage_type()) { + case HMatrix, CoordinatePrecision>::StorageType::Dense: + if (A.get_symmetry() == 'N') { + add_matrix_matrix_product_row_major(transa, transb, alpha, *A.get_dense_data(), in, beta, out, mu); + } else if (A.get_symmetry() == 'S') { + add_symmetric_matrix_matrix_product_row_major('L', A.get_UPLO(), alpha, *A.get_dense_data(), in, beta, out, mu); + } else { + add_hermitian_matrix_matrix_product_row_major('L', A.get_UPLO(), alpha, *A.get_dense_data(), in, beta, out, mu); + } + break; + case HMatrix, CoordinatePrecision>::StorageType::LowRank: + add_lrmat_matrix_product_row_major(transa, transb, alpha, *A.get_low_rank_data(), in, beta, out, mu); + break; + default: + sequential_add_hmatrix_matrix_product_row_major(transa, transb, alpha, A, in, beta, out, mu); + break; + } +} + template void sequential_add_hmatrix_matrix_product_row_major(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *B, CoefficientPrecision beta, CoefficientPrecision *C, int mu) { auto &leaves = A.get_leaves(); @@ -129,88 +170,69 @@ void openmp_add_hmatrix_matrix_product_row_major(char transa, char transb, Coeff } } -template -void add_hmatrix_matrix_product_row_major(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) { - switch (A.get_storage_type()) { - case HMatrix::StorageType::Dense: - if (A.get_symmetry() == 'N') { - A.get_dense_data()->add_matrix_product_row_major(transa, alpha, in, beta, out, mu); - } else { - A.get_dense_data()->add_matrix_product_symmetric_row_major(transa, alpha, in, beta, out, mu, A.get_UPLO(), A.get_symmetry()); - } - break; - case HMatrix::StorageType::LowRank: - A.get_low_rank_data()->add_matrix_product_row_major(transa, alpha, in, beta, out, mu); - break; - default: - sequential_add_hmatrix_matrix_product_row_major(transa, transb, alpha, A, in, beta, out, mu); - break; - } -} - -template -void sequential_add_symmetric_hmatrix_matrix_product_row_major(char side, char UPLO, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *B, CoefficientPrecision beta, CoefficientPrecision *C, int mu) { - auto &leaves = A.get_leaves(); - auto &leaves_for_symmetry = A.get_leaves_for_symmetry(); - - if (side != 'L') { - htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for sequential_add_symmetric_hmatrix_matrix_product_row_major (side=" + std::string(1, side) + ")"); // LCOV_EXCL_LINE - } - - int out_size(A.get_target_cluster().get_size() * mu); - auto get_output_cluster{&HMatrix::get_target_cluster}; - auto get_input_cluster{&HMatrix::get_source_cluster}; - int local_output_offset = A.get_target_cluster().get_offset(); - int local_input_offset = A.get_source_cluster().get_offset(); - char trans_sym = 'T'; - - int incx(1), incy(1); - if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { - // TODO: use blas - std::transform(C, C + out_size, C, [&beta](CoefficientPrecision &c) { return c * beta; }); - } - - // Contribution champ lointain - std::vector temp(out_size, 0); - for (int b = 0; b < leaves.size(); b++) { - int input_offset = (leaves[b]->*get_input_cluster)().get_offset(); - int output_offset = (leaves[b]->*get_output_cluster)().get_offset(); - if (input_offset == output_offset) { - add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, CoefficientPrecision(1), *leaves[b], B + (input_offset - local_input_offset) * mu, CoefficientPrecision(1), temp.data() + (output_offset - local_output_offset) * mu, mu); - } else { - add_hmatrix_matrix_product_row_major('N', 'N', CoefficientPrecision(1), *leaves[b], B + (input_offset - local_input_offset) * mu, CoefficientPrecision(1), temp.data() + (output_offset - local_output_offset) * mu, mu); - } - } - - // Symmetry part of the diagonal part - for (int b = 0; b < leaves_for_symmetry.size(); b++) { - int input_offset = (leaves_for_symmetry[b]->*get_input_cluster)().get_offset(); - int output_offset = (leaves_for_symmetry[b]->*get_output_cluster)().get_offset(); - add_hmatrix_matrix_product_row_major(trans_sym, 'N', CoefficientPrecision(1), *leaves_for_symmetry[b], B + (output_offset - local_input_offset) * mu, CoefficientPrecision(1), temp.data() + (input_offset - local_output_offset) * mu, mu); - } - Blas::axpy(&out_size, &alpha, temp.data(), &incx, C, &incy); -} - -template -void add_symmetric_hmatrix_matrix_product_row_major(char side, char UPLO, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) { - switch (A.get_storage_type()) { - case HMatrix::StorageType::Dense: - add_symmetric_matrix_matrix_product_row_major(side, UPLO, alpha, *A.get_dense_data(), in, beta, out, mu); - break; - case HMatrix::StorageType::LowRank: - htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for add_symmetric_hmatrix_matrix_product_row_major (symmetric lrmat?)"); // LCOV_EXCL_LINE - // add_symmetric_matrix_product_row_major(side, UPLO, alpha, *A.get_low_rank_data(), in, beta, out, mu); - break; - default: - if (side == 'L') { - sequential_add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, alpha, A, in, beta, out, mu); - } else { - // - sequential_add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, alpha, A, in, beta, out, mu); - } - break; - } -} +// template +// void sequential_add_symmetric_hmatrix_matrix_product_row_major(char side, char UPLO, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *B, CoefficientPrecision beta, CoefficientPrecision *C, int mu) { +// auto &leaves = A.get_leaves(); +// auto &leaves_for_symmetry = A.get_leaves_for_symmetry(); + +// if (side != 'L') { +// htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for sequential_add_symmetric_hmatrix_matrix_product_row_major (side=" + std::string(1, side) + ")"); // LCOV_EXCL_LINE +// } + +// int out_size(A.get_target_cluster().get_size() * mu); +// auto get_output_cluster{&HMatrix::get_target_cluster}; +// auto get_input_cluster{&HMatrix::get_source_cluster}; +// int local_output_offset = A.get_target_cluster().get_offset(); +// int local_input_offset = A.get_source_cluster().get_offset(); +// char trans_sym = 'T'; + +// int incx(1), incy(1); +// if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { +// // TODO: use blas +// std::transform(C, C + out_size, C, [&beta](CoefficientPrecision &c) { return c * beta; }); +// } + +// // Contribution champ lointain +// std::vector temp(out_size, 0); +// for (int b = 0; b < leaves.size(); b++) { +// int input_offset = (leaves[b]->*get_input_cluster)().get_offset(); +// int output_offset = (leaves[b]->*get_output_cluster)().get_offset(); +// if (input_offset == output_offset) { +// add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, CoefficientPrecision(1), *leaves[b], B + (input_offset - local_input_offset) * mu, CoefficientPrecision(1), temp.data() + (output_offset - local_output_offset) * mu, mu); +// } else { +// add_hmatrix_matrix_product_row_major('N', 'N', CoefficientPrecision(1), *leaves[b], B + (input_offset - local_input_offset) * mu, CoefficientPrecision(1), temp.data() + (output_offset - local_output_offset) * mu, mu); +// } +// } + +// // Symmetry part of the diagonal part +// for (int b = 0; b < leaves_for_symmetry.size(); b++) { +// int input_offset = (leaves_for_symmetry[b]->*get_input_cluster)().get_offset(); +// int output_offset = (leaves_for_symmetry[b]->*get_output_cluster)().get_offset(); +// add_hmatrix_matrix_product_row_major(trans_sym, 'N', CoefficientPrecision(1), *leaves_for_symmetry[b], B + (output_offset - local_input_offset) * mu, CoefficientPrecision(1), temp.data() + (input_offset - local_output_offset) * mu, mu); +// } +// Blas::axpy(&out_size, &alpha, temp.data(), &incx, C, &incy); +// } + +// template +// void add_symmetric_hmatrix_matrix_product_row_major(char side, char UPLO, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, int mu) { +// switch (A.get_storage_type()) { +// case HMatrix::StorageType::Dense: +// add_symmetric_matrix_matrix_product_row_major(side, UPLO, alpha, *A.get_dense_data(), in, beta, out, mu); +// break; +// case HMatrix::StorageType::LowRank: +// htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for add_symmetric_hmatrix_matrix_product_row_major (symmetric lrmat?)"); // LCOV_EXCL_LINE +// // add_symmetric_matrix_product_row_major(side, UPLO, alpha, *A.get_low_rank_data(), in, beta, out, mu); +// break; +// default: +// if (side == 'L') { +// sequential_add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, alpha, A, in, beta, out, mu); +// } else { +// // +// sequential_add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, alpha, A, in, beta, out, mu); +// } +// break; +// } +// } } // namespace htool diff --git a/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp b/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp index ccda0d8c..5a70f65c 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp @@ -1,52 +1,96 @@ #ifndef HTOOL_HMATRIX_LINALG_ADD_HMATRIX_VECTOR_PRODUCT_HPP #define HTOOL_HMATRIX_LINALG_ADD_HMATRIX_VECTOR_PRODUCT_HPP +#include "../../matrix/linalg/add_matrix_matrix_product.hpp" +#include "../../matrix/linalg/add_matrix_vector_product.hpp" #include "../hmatrix.hpp" +#include "../lrmat/linalg/add_lrmat_matrix_product.hpp" +#include "../lrmat/linalg/add_lrmat_vector_product.hpp" namespace htool { -// template -// void sequential_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { - -// int out_size(A.get_target_cluster().get_size()); -// auto get_output_cluster{&HMatrix::get_target_cluster}; -// auto get_input_cluster{&HMatrix::get_source_cluster}; -// int local_input_offset = A.get_source_cluster().get_offset(); -// int local_output_offset = A.get_target_cluster().get_offset(); -// char trans_sym = (A.get_symmetry_for_leaves() == 'S') ? 'T' : 'C'; - -// if (trans != 'N') { -// out_size = A.get_source_cluster().get_size(); -// get_input_cluster = &HMatrix::get_target_cluster; -// get_output_cluster = &HMatrix::get_source_cluster; -// local_input_offset = A.get_target_cluster().get_offset(); -// local_output_offset = A.get_source_cluster().get_offset(); -// trans_sym = 'N'; -// } - -// if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { -// std::transform(out, out + out_size, out, [&beta](CoefficientPrecision &c) { return c * beta; }); -// } - -// // Contribution champ lointain -// std::vector temp(out_size, 0); -// // for (int b = 0; b < A.get_leaves().size(); b++) { -// for (auto &leaf : A.get_leaves()) { -// int input_offset = (leaf->*get_input_cluster)().get_offset(); -// int output_offset = (leaf->*get_output_cluster)().get_offset(); -// leaf->add_vector_product(trans, 1, in + input_offset - local_input_offset, alpha, out + (output_offset - local_output_offset)); -// } - -// // Symmetry part of the diagonal part -// if (A.get_symmetry_for_leaves() != 'N') { -// // for (int b = 0; b < A.get_leaves_for_symmetry().size(); b++) { -// for (auto &leaf_for_symmetry : A.get_leaves_for_symmetry()) { -// int input_offset = (leaf_for_symmetry->*get_input_cluster)().get_offset(); -// int output_offset = (leaf_for_symmetry->*get_output_cluster)().get_offset(); -// leaf_for_symmetry->add_vector_product(trans_sym, alpha, in + output_offset - local_input_offset, 1, out + (input_offset - local_output_offset)); -// } -// } -// } +template +void add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { + switch (A.get_storage_type()) { + case HMatrix::StorageType::Dense: + if (A.get_symmetry() == 'N') { + add_matrix_vector_product(trans, alpha, *A.get_dense_data(), in, beta, out); + } else if (A.get_symmetry() == 'S') { + add_symmetric_matrix_vector_product(A.get_UPLO(), alpha, *A.get_dense_data(), in, beta, out); + } + break; + case HMatrix::StorageType::LowRank: + add_lrmat_vector_product(trans, alpha, *A.get_low_rank_data(), in, beta, out); + break; + default: + sequential_add_hmatrix_vector_product(trans, alpha, A, in, beta, out); + break; + } +} + +template +void add_hmatrix_vector_product(char trans, std::complex alpha, const HMatrix, CoordinatePrecision> &A, const std::complex *in, std::complex beta, std::complex *out) { + switch (A.get_storage_type()) { + case HMatrix, CoordinatePrecision>::StorageType::Dense: + if (A.get_symmetry() == 'N') { + add_matrix_vector_product(trans, alpha, *A.get_dense_data(), in, beta, out); + } else if (A.get_symmetry() == 'S') { + add_symmetric_matrix_vector_product(A.get_UPLO(), alpha, *A.get_dense_data(), in, beta, out); + } else if (A.get_symmetry() == 'H') { + add_hermitian_matrix_vector_product(A.get_UPLO(), alpha, *A.get_dense_data(), in, beta, out); + } + break; + case HMatrix, CoordinatePrecision>::StorageType::LowRank: + add_lrmat_vector_product(trans, alpha, *A.get_low_rank_data(), in, beta, out); + break; + default: + sequential_add_hmatrix_vector_product(trans, alpha, A, in, beta, out); + break; + } +} + +template +void sequential_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { + + int out_size(A.get_target_cluster().get_size()); + auto get_output_cluster{&HMatrix::get_target_cluster}; + auto get_input_cluster{&HMatrix::get_source_cluster}; + int local_input_offset = A.get_source_cluster().get_offset(); + int local_output_offset = A.get_target_cluster().get_offset(); + char trans_sym = (A.get_symmetry_for_leaves() == 'S') ? 'T' : 'C'; + + if (trans != 'N') { + out_size = A.get_source_cluster().get_size(); + get_input_cluster = &HMatrix::get_target_cluster; + get_output_cluster = &HMatrix::get_source_cluster; + local_input_offset = A.get_target_cluster().get_offset(); + local_output_offset = A.get_source_cluster().get_offset(); + trans_sym = 'N'; + } + + if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { + std::transform(out, out + out_size, out, [&beta](CoefficientPrecision &c) { return c * beta; }); + } + + // Contribution champ lointain + std::vector temp(out_size, 0); + // for (int b = 0; b < A.get_leaves().size(); b++) { + for (auto &leaf : A.get_leaves()) { + int input_offset = (leaf->*get_input_cluster)().get_offset(); + int output_offset = (leaf->*get_output_cluster)().get_offset(); + add_hmatrix_vector_product(trans, alpha, *leaf, in + input_offset - local_input_offset, CoefficientPrecision(1), out + (output_offset - local_output_offset)); + } + + // Symmetry part of the diagonal part + if (A.get_symmetry_for_leaves() != 'N') { + // for (int b = 0; b < A.get_leaves_for_symmetry().size(); b++) { + for (auto &leaf_for_symmetry : A.get_leaves_for_symmetry()) { + int input_offset = (leaf_for_symmetry->*get_input_cluster)().get_offset(); + int output_offset = (leaf_for_symmetry->*get_output_cluster)().get_offset(); + add_hmatrix_vector_product(trans_sym, alpha, *leaf_for_symmetry, in + output_offset - local_input_offset, CoefficientPrecision(1), out + (input_offset - local_output_offset)); + } + } +} template void openmp_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { @@ -89,7 +133,7 @@ void openmp_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, c for (int b = 0; b < leaves.size(); b++) { int input_offset = (leaves[b]->*get_input_cluster)().get_offset(); int output_offset = (leaves[b]->*get_output_cluster)().get_offset(); - leaves[b]->add_vector_product(trans, 1, in + input_offset - local_input_offset, 1, temp.data() + (output_offset - local_output_offset)); + add_hmatrix_vector_product(trans, CoefficientPrecision(1), *leaves[b], in + input_offset - local_input_offset, CoefficientPrecision(1), temp.data() + (output_offset - local_output_offset)); } // Symmetry part of the diagonal part @@ -100,7 +144,7 @@ void openmp_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, c for (int b = 0; b < leaves_for_symmetry.size(); b++) { int input_offset = (leaves_for_symmetry[b]->*get_input_cluster)().get_offset(); int output_offset = (leaves_for_symmetry[b]->*get_output_cluster)().get_offset(); - leaves_for_symmetry[b]->add_vector_product(trans_sym, 1, in + output_offset - local_input_offset, 1, temp.data() + (input_offset - local_output_offset)); + add_hmatrix_vector_product(trans_sym, CoefficientPrecision(1), *leaves_for_symmetry[b], in + output_offset - local_input_offset, CoefficientPrecision(1), temp.data() + (input_offset - local_output_offset)); } } @@ -111,24 +155,6 @@ void openmp_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, c } } -template -void add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { - switch (A.get_storage_type()) { - case HMatrix::StorageType::Dense: - if (A.get_symmetry() == 'N') { - A.get_dense_data()->add_vector_product(trans, alpha, in, beta, out); - } else { - A.get_dense_data()->add_vector_product_symmetric(trans, alpha, in, beta, out, A.get_UPLO(), A.get_symmetry()); - } - break; - case HMatrix::StorageType::LowRank: - A.get_low_rank_data()->add_vector_product(trans, alpha, in, beta, out); - break; - default: - openmp_add_hmatrix_vector_product(trans, alpha, A, in, beta, out); - break; - } -} } // namespace htool #endif diff --git a/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp b/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp index ad14c020..78787665 100644 --- a/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp @@ -14,11 +14,11 @@ void add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision al auto &V = A.get_V(); if (transa == 'N') { Matrix VB(V.nb_rows(), transb == 'N' ? B.nb_cols() : B.nb_rows()); - add_matrix_hmatrix_product(transa, transb, 1, V, B, 0, VB); + add_matrix_hmatrix_product(transa, transb, CoefficientPrecision(1), V, B, CoefficientPrecision(0), VB); add_matrix_matrix_product(transa, 'N', alpha, U, VB, beta, C); } else { Matrix UtB(V.nb_rows(), transb == 'N' ? B.nb_cols() : B.nb_rows()); - add_matrix_hmatrix_product(transa, transb, 1, U, B, 0, UtB); + add_matrix_hmatrix_product(transa, transb, CoefficientPrecision(1), U, B, CoefficientPrecision(0), UtB); add_matrix_matrix_product(transa, 'N', alpha, V, UtB, beta, C); } } diff --git a/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp b/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp index 4d3adf2a..2f53a632 100644 --- a/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp @@ -10,12 +10,33 @@ template &A, const HMatrix &B, CoefficientPrecision beta, Matrix &C) { char new_transa = transb == 'N' ? 'T' : 'N'; + new_transa = B.get_symmetry_for_leaves() == 'H' and transb == 'N' ? 'N' : new_transa; + new_transa = B.get_symmetry_for_leaves() == 'S' and transb == 'N' ? 'N' : new_transa; if (transa == 'N') { - sequential_add_hmatrix_matrix_product_row_major(new_transa, 'N', alpha, B, A.data(), beta, C.data(), C.nb_rows()); + bool need_buffer_for_conj = transb == 'C' or (B.get_symmetry_for_leaves() == 'H' and transb == 'N'); + std::vector buffer_A(need_buffer_for_conj ? A.nb_cols() * A.nb_rows() : 0); + if (need_buffer_for_conj) { + std::copy(A.data(), A.data() + A.nb_cols() * A.nb_rows(), buffer_A.data()); + conj_if_complex(buffer_A.data(), buffer_A.size()); + conj_if_complex(C.data(), C.nb_rows() * C.nb_cols()); + } + sequential_add_hmatrix_matrix_product_row_major(new_transa, 'N', need_buffer_for_conj ? conj_if_complex(alpha) : alpha, B, need_buffer_for_conj ? buffer_A.data() : A.data(), need_buffer_for_conj ? conj_if_complex(beta) : beta, C.data(), C.nb_rows()); + if (need_buffer_for_conj) { + conj_if_complex(C.data(), C.nb_rows() * C.nb_cols()); + } } else { Matrix transposed_A(A.nb_cols(), A.nb_rows()); transpose(A, transposed_A); - sequential_add_hmatrix_matrix_product_row_major(new_transa, 'N', alpha, B, transposed_A.data(), beta, C.data(), C.nb_rows()); + if (transb == 'C') { + conj_if_complex(C.data(), C.nb_rows() * C.nb_cols()); + } + if ((transa == 'T' && transb == 'C') or (transa == 'C' and transb != 'C')) { + conj_if_complex(transposed_A.data(), transposed_A.nb_rows() * transposed_A.nb_cols()); + } + sequential_add_hmatrix_matrix_product_row_major(new_transa, 'N', transb == 'C' ? conj_if_complex(alpha) : alpha, B, transposed_A.data(), transb == 'C' ? conj_if_complex(beta) : beta, C.data(), C.nb_rows()); + if (transb == 'C') { + conj_if_complex(C.data(), C.nb_rows() * C.nb_cols()); + } } } diff --git a/include/htool/hmatrix/linalg/triangular_hmatrix_hmatrix_solve.hpp b/include/htool/hmatrix/linalg/triangular_hmatrix_hmatrix_solve.hpp index 3ceb1e6d..135410ec 100644 --- a/include/htool/hmatrix/linalg/triangular_hmatrix_hmatrix_solve.hpp +++ b/include/htool/hmatrix/linalg/triangular_hmatrix_hmatrix_solve.hpp @@ -107,7 +107,7 @@ void triangular_hmatrix_hmatrix_solve(char side, char UPLO, char transa, char di } } } // Forward, compute each block column one after the other - else if ((UPLO == 'U' && transa == 'N' && side == 'R') || (UPLO == 'L' && transa == 'T' && side == 'R')) { + else if ((UPLO == 'U' && transa == 'N' && side == 'R') || (UPLO == 'L' && transa != 'N' && side == 'R')) { for (auto input_it = input_clusters.begin(); input_it != input_clusters.end(); ++input_it) { auto &input_cluster_child = *input_it; @@ -132,7 +132,7 @@ void triangular_hmatrix_hmatrix_solve(char side, char UPLO, char transa, char di } } } // Backward, compute each block column one after the other - else if ((UPLO == 'L' && transa == 'N' && side == 'R') || (UPLO == 'U' && transa == 'T' && side == 'R')) { + else if ((UPLO == 'L' && transa == 'N' && side == 'R') || (UPLO == 'U' && transa != 'N' && side == 'R')) { for (auto input_it = input_clusters.rbegin(); input_it != input_clusters.rend(); ++input_it) { auto &input_cluster_child = *input_it; diff --git a/include/htool/hmatrix/linalg/triangular_hmatrix_matrix_solve.hpp b/include/htool/hmatrix/linalg/triangular_hmatrix_matrix_solve.hpp index 645a6499..dcc933ae 100644 --- a/include/htool/hmatrix/linalg/triangular_hmatrix_matrix_solve.hpp +++ b/include/htool/hmatrix/linalg/triangular_hmatrix_matrix_solve.hpp @@ -114,7 +114,13 @@ void triangular_hmatrix_matrix_solve(char side, char UPLO, char transa, char dia transpose(transposed_B, B); } else { char transposed_transa = transa == 'N' ? 'T' : 'N'; - triangular_hmatrix_matrix_solve_row_major('L', UPLO, transposed_transa, diag, alpha, A, B); + if (transa == 'C') { + conj_if_complex(B.data(), B.nb_rows() * B.nb_cols()); + } + triangular_hmatrix_matrix_solve_row_major('L', UPLO, transposed_transa, diag, transa == 'C' ? conj_if_complex(alpha) : alpha, A, B); + if (transa == 'C') { + conj_if_complex(B.data(), B.nb_rows() * B.nb_cols()); + } } } diff --git a/include/htool/local_operators/local_hmatrix.hpp b/include/htool/local_operators/local_hmatrix.hpp index 7578373c..ff6b4174 100644 --- a/include/htool/local_operators/local_hmatrix.hpp +++ b/include/htool/local_operators/local_hmatrix.hpp @@ -5,6 +5,9 @@ #include "../clustering/cluster_node.hpp" #include "../hmatrix/hmatrix.hpp" #include "../hmatrix/interfaces/virtual_generator.hpp" +#include "../hmatrix/linalg/add_hmatrix_matrix_product.hpp" +#include "../hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp" +#include "../hmatrix/linalg/add_hmatrix_vector_product.hpp" #include "../hmatrix/tree_builder/tree_builder.hpp" #include "../matrix/matrix.hpp" #include "local_operator.hpp" @@ -24,10 +27,10 @@ class LocalHMatrix final : public LocalOperator &get_hmatrix() const { return *m_data.get(); } }; diff --git a/include/htool/testing/generate_test_case.hpp b/include/htool/testing/generate_test_case.hpp index a44ab0a9..7bc3e23a 100644 --- a/include/htool/testing/generate_test_case.hpp +++ b/include/htool/testing/generate_test_case.hpp @@ -112,6 +112,10 @@ class TestCaseProduct { template class TestCaseSymmetricProduct { + std::unique_ptr operator_in_user_numbering_A = nullptr; + std::unique_ptr operator_in_user_numbering_B = nullptr; + std::unique_ptr operator_in_user_numbering_C = nullptr; + public: char side; char symmetry; @@ -126,9 +130,9 @@ class TestCaseSymmetricProduct { const Cluster> *root_cluster_B_output = nullptr; const Cluster> *root_cluster_C_input = nullptr; const Cluster> *root_cluster_C_output = nullptr; - std::unique_ptr operator_A = nullptr; - std::unique_ptr operator_B = nullptr; - std::unique_ptr operator_C = nullptr; + std::unique_ptr> operator_A = nullptr; + std::unique_ptr> operator_B = nullptr; + std::unique_ptr> operator_C = nullptr; int ni_A; int no_A; int ni_B; @@ -136,7 +140,7 @@ class TestCaseSymmetricProduct { int ni_C; int no_C; - TestCaseSymmetricProduct(int n1, int n2, underlying_type z_distance_A, char side0, char symmetry0, char UPLO0, int number_of_partition = -1) : side(side0), symmetry(symmetry0), UPLO(UPLO0), x1(3 * n1), x2(3 * n1) { + TestCaseSymmetricProduct(int n1, int n2, underlying_type z_distance_A, char side0, char symmetry0, char UPLO0, int number_of_partition = -1) : side(side0), symmetry(symmetry0), UPLO(UPLO0), x1(3 * n1), x2(3 * n2) { // Input sizes ni_A = n1; @@ -170,27 +174,30 @@ class TestCaseSymmetricProduct { } // Operators - operator_A = std::make_unique(3, no_A, ni_A, x1, x1, *root_cluster_1, *root_cluster_1, true, true); - root_cluster_A_output = root_cluster_1.get(); - root_cluster_A_input = root_cluster_1.get(); + operator_in_user_numbering_A = std::make_unique(3, x1, x1); + root_cluster_A_output = root_cluster_1.get(); + root_cluster_A_input = root_cluster_1.get(); if (side == 'L') { - operator_B = std::make_unique(3, no_B, ni_B, x1, x2, *root_cluster_1, *root_cluster_2, true, true); - root_cluster_B_output = root_cluster_1.get(); - root_cluster_B_input = root_cluster_2.get(); + operator_in_user_numbering_B = std::make_unique(3, x1, x2); + root_cluster_B_output = root_cluster_1.get(); + root_cluster_B_input = root_cluster_2.get(); - operator_C = std::make_unique(3, no_C, ni_C, x1, x2, *root_cluster_1, *root_cluster_2, true, true); - root_cluster_C_input = root_cluster_1.get(); - root_cluster_C_output = root_cluster_2.get(); + operator_in_user_numbering_C = std::make_unique(3, x1, x2); + root_cluster_C_output = root_cluster_1.get(); + root_cluster_C_input = root_cluster_2.get(); } else { - operator_B = std::make_unique(3, no_B, ni_B, x2, x1, *root_cluster_2, *root_cluster_1, true, true); - root_cluster_B_output = root_cluster_2.get(); - root_cluster_B_input = root_cluster_1.get(); + operator_in_user_numbering_B = std::make_unique(3, x2, x1); + root_cluster_B_output = root_cluster_2.get(); + root_cluster_B_input = root_cluster_1.get(); - operator_C = std::make_unique(3, no_C, ni_C, x2, x1, *root_cluster_2, *root_cluster_1, true, true); - root_cluster_C_input = root_cluster_2.get(); - root_cluster_C_output = root_cluster_1.get(); + operator_in_user_numbering_C = std::make_unique(3, x2, x1); + root_cluster_C_output = root_cluster_2.get(); + root_cluster_C_input = root_cluster_1.get(); } + operator_A = std::make_unique>(*operator_in_user_numbering_A, root_cluster_A_output->get_permutation().data(), root_cluster_A_input->get_permutation().data()); + operator_B = std::make_unique>(*operator_in_user_numbering_B, root_cluster_B_output->get_permutation().data(), root_cluster_B_input->get_permutation().data()); + operator_C = std::make_unique>(*operator_in_user_numbering_C, root_cluster_C_output->get_permutation().data(), root_cluster_C_input->get_permutation().data()); } }; diff --git a/tests/functional_tests/hmatrix/hmatrix_factorization/test_hmatrix_factorization_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_factorization/test_hmatrix_factorization_complex_double.cpp index b0552133..9cc27dfe 100644 --- a/tests/functional_tests/hmatrix/hmatrix_factorization/test_hmatrix_factorization_complex_double.cpp +++ b/tests/functional_tests/hmatrix/hmatrix_factorization/test_hmatrix_factorization_complex_double.cpp @@ -18,6 +18,9 @@ int main(int, char *[]) { for (auto trans : {'N', 'T'}) { is_error = is_error || test_hmatrix_lu, GeneratorTestComplexHermitian>(trans, n1, n2, epsilon, margin); } + for (auto UPLO : {'L', 'U'}) { + is_error = is_error || test_hmatrix_cholesky, GeneratorTestComplexHermitian>(UPLO, n1, n2, epsilon, margin); + } } if (is_error) { diff --git a/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_complex_double.cpp index fe2fd0e3..9cc1bb99 100644 --- a/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_complex_double.cpp +++ b/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_complex_double.cpp @@ -1,6 +1,5 @@ #include "../test_hmatrix_product.hpp" #include -#include using namespace std; using namespace htool; @@ -8,36 +7,32 @@ using namespace htool; int main(int argc, char *argv[]) { MPI_Init(&argc, &argv); - bool is_error = false; - const double margin = 10; + bool is_error = false; + const std::array additional_lrmat_sum_tolerances{10., 10., 10., 10., 10.}; - for (auto epsilon : {1e-6, 1e-10}) { + for (auto epsilon : {1e-10}) { for (auto n1 : {200, 400}) { for (auto n3 : {100}) { for (auto use_local_cluster : {true, false}) { for (auto n2 : {200, 400}) { - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { + for (auto transa : {'N', 'T', 'C'}) { + for (auto transb : {'N', 'T', 'C'}) { std::cout << "hmatrix product: " << use_local_cluster << " " << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_hmatrix_product(transa, transb, n1, n2, n3, use_local_cluster, epsilon, margin); + is_error = is_error || test_hmatrix_product, GeneratorTestComplex>(transa, transb, n1, n2, n3, use_local_cluster, epsilon, additional_lrmat_sum_tolerances); } } } } - - // TODO: fix 'C' operation, missing some conj operations somewhere - // for (auto operation : {'N', 'C'}) { - - // // Square matrix - // is_error = is_error || test_hmatrix_product, GeneratorTestComplexSymmetric>(operation, 'N', n1, n2, n3, 'N', 'N', 'N', use_local_cluster, epsilon, margin); - // is_error = is_error || test_hmatrix_product, GeneratorTestComplexHermitian>(operation, 'N', n1, n2, n3, 'L', 'H', 'L', use_local_cluster, epsilon, margin); - // is_error = is_error || test_hmatrix_product, GeneratorTestComplexHermitian>(operation, 'N', n1, n2, n3, 'L', 'H', 'U', use_local_cluster, epsilon, margin); - - // // Rectangle matrix - // is_error = is_error || test_hmatrix_product, GeneratorTestComplex>(operation, 'N', n1_increased, n2, n3, 'N', 'N', 'N', use_local_cluster, epsilon, margin); - // is_error = is_error || test_hmatrix_product, GeneratorTestComplex>(operation, 'N', n1, n2_increased, n3, 'N', 'N', 'N', use_local_cluster, epsilon, margin); - // } + for (auto side : {'L', 'R'}) { + for (auto UPLO : {'U', 'L'}) { + const double margin = 0; + std::cout << "symmetric matrix product: " << n1 << " " << n3 << " " << side << " " << UPLO << " " << epsilon << "\n"; + is_error = is_error || test_symmetric_hmatrix_product, GeneratorTestComplexSymmetric>(n1, n3, side, UPLO, epsilon, margin); + std::cout << "hermitian matrix product: " << n1 << " " << n3 << " " << side << " " << UPLO << " " << epsilon << "\n"; + is_error = is_error || test_hermitian_hmatrix_product, GeneratorTestComplexHermitian>(n1, n3, side, UPLO, epsilon, margin); + } + } } } } diff --git a/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_double.cpp b/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_double.cpp index da3a3d01..29563497 100644 --- a/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_double.cpp +++ b/tests/functional_tests/hmatrix/hmatrix_product/test_hmatrix_product_double.cpp @@ -7,8 +7,8 @@ using namespace htool; int main(int argc, char *argv[]) { MPI_Init(&argc, &argv); - bool is_error = false; - const double margin = 10; + bool is_error = false; + const std::array additional_lrmat_sum_tolerances{10., 10., 10., 10., 10.}; for (auto epsilon : {1e-6, 1e-10}) { for (auto n1 : {200, 400}) { @@ -19,17 +19,18 @@ int main(int argc, char *argv[]) { for (auto transb : {'N', 'T'}) { std::cout << "hmatrix product: " << use_local_cluster << " " << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_hmatrix_product(transa, transb, n1, n2, n3, use_local_cluster, epsilon, margin); + is_error = is_error || test_hmatrix_product(transa, transb, n1, n2, n3, use_local_cluster, epsilon, additional_lrmat_sum_tolerances); } } } } - // for (auto side : {'L', 'R'}) { - // for (auto UPLO : {'U', 'L'}) { - // std::cout << "symmetric matrix product: " << n1 << " " << n3 << " " << side << " " << UPLO << " " << epsilon << "\n"; - // is_error = is_error || test_symmetric_hmatrix_product(n1, n3, side, UPLO, epsilon, margin); - // } - // } + for (auto side : {'L', 'R'}) { + for (auto UPLO : {'U', 'L'}) { + const double margin = 0; + std::cout << "symmetric matrix product: " << n1 << " " << n3 << " " << side << " " << UPLO << " " << epsilon << "\n"; + is_error = is_error || test_symmetric_hmatrix_product(n1, n3, side, UPLO, epsilon, margin); + } + } } } } diff --git a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp index bf2a0d45..7eeb2cc6 100644 --- a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp +++ b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp @@ -16,8 +16,7 @@ int main(int, char *[]) { for (auto number_of_rhs : {100}) { for (auto epsilon : {1e-6, 1e-10}) { for (auto side : {'L', 'R'}) { - // TODO: add 'C' operation when hmatrix product is checked - for (auto operation : {'N', 'T'}) { + for (auto operation : {'N', 'T', 'C'}) { std::cout << epsilon << " " << number_of_rhs << " " << side << " " << operation << "\n"; is_error = is_error || test_hmatrix_triangular_solve, GeneratorTestComplexHermitian>(side, operation, n1, number_of_rhs, epsilon, margin); } diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_product/CMakeLists.txt b/tests/functional_tests/hmatrix/lrmat/lrmat_product/CMakeLists.txt index 313d9ae0..7b5dad54 100755 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_product/CMakeLists.txt +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_product/CMakeLists.txt @@ -3,13 +3,12 @@ #=============================================================================# #=== lrmat_SVD -set(compressions "fullACA") -list(APPEND compressions "partialACA") -list(APPEND compressions "sympartialACA") -list(APPEND compressions "SVD") -foreach(compression ${compressions}) - add_executable(Test_lrmat_product_${compression} test_lrmat_product_${compression}.cpp) - target_link_libraries(Test_lrmat_product_${compression} htool) - add_dependencies(build-tests-lrmat-product Test_lrmat_product_${compression}) - add_test(Test_lrmat_product_${compression} Test_lrmat_product_${compression}) +set(types "double") +list(APPEND types "complex_double") + +foreach(type ${types}) + add_executable(Test_lrmat_product_${type} test_lrmat_product_${type}.cpp) + target_link_libraries(Test_lrmat_product_${type} htool) + add_dependencies(build-tests-lrmat-product Test_lrmat_product_${type}) + add_test(Test_lrmat_product_${type} Test_lrmat_product_${type}) endforeach() diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_complex_double.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_complex_double.cpp new file mode 100644 index 00000000..ac1b533e --- /dev/null +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_complex_double.cpp @@ -0,0 +1,32 @@ +#include "../test_lrmat_product.hpp" +#include + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + const double additional_compression_tolerance = 0; + const std::array additional_lrmat_sum_tolerances{10., 10., 10., 10.}; + + for (auto epsilon : {1e-6, 1e-10}) { + for (auto n1 : {200, 400}) { + for (auto n3 : {100}) { + for (auto n2 : {200, 400}) { + for (auto transa : {'N', 'T', 'C'}) { + for (auto transb : {'N', 'T', 'C'}) { + std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; + is_error = is_error || test_lrmat_product, GeneratorTestComplex, SVD>>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); + } + } + } + } + } + } + + if (is_error) { + return 1; + } + return 0; +} diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_SVD.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_double.cpp similarity index 67% rename from tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_SVD.cpp rename to tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_double.cpp index ba38e04d..c26248f5 100644 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_SVD.cpp +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_double.cpp @@ -20,12 +20,6 @@ int main(int, char *[]) { is_error = is_error || test_lrmat_product>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); } } - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product, GeneratorTestComplex, SVD>>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } } } } diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_fullACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_fullACA.cpp deleted file mode 100644 index 37bcf8f2..00000000 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_fullACA.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include "../test_lrmat_product.hpp" -#include - -using namespace std; -using namespace htool; - -int main(int, char *[]) { - - bool is_error = false; - const double additional_compression_tolerance = 0; - const std::array additional_lrmat_sum_tolerances{1., 1., 1., 1.}; - - for (auto epsilon : {1e-6, 1e-10}) { - for (auto n1 : {200, 400}) { - for (auto n3 : {100}) { - for (auto n2 : {200, 400}) { - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product, GeneratorTestComplex, fullACA>>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } - } - } - } - } - - if (is_error) { - return 1; - } - return 0; -} diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_partialACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_partialACA.cpp deleted file mode 100644 index 29d897b0..00000000 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_partialACA.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include "../test_lrmat_product.hpp" -#include - -using namespace std; -using namespace htool; - -int main(int, char *[]) { - - bool is_error = false; - const double additional_compression_tolerance = 0; - const std::array additional_lrmat_sum_tolerances{1., 1., 1., 1.}; - - for (auto epsilon : {1e-6, 1e-10}) { - for (auto n1 : {200, 400}) { - for (auto n3 : {100}) { - for (auto n2 : {200, 400}) { - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product, GeneratorTestComplex, partialACA>>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } - } - } - } - } - if (is_error) { - return 1; - } - return 0; -} diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_sympartialACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_sympartialACA.cpp deleted file mode 100644 index 0faea38e..00000000 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_product/test_lrmat_product_sympartialACA.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include "../test_lrmat_product.hpp" -#include - -using namespace std; -using namespace htool; - -int main(int, char *[]) { - - bool is_error = false; - const double additional_compression_tolerance = 0; - const std::array additional_lrmat_sum_tolerances{1., 1., 1., 1.}; - - for (auto epsilon : {1e-6, 1e-10}) { - for (auto n1 : {200, 400}) { - for (auto n3 : {100}) { - for (auto n2 : {200, 400}) { - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } - for (auto transa : {'N', 'T'}) { - for (auto transb : {'N', 'T'}) { - std::cout << epsilon << " " << n1 << " " << n2 << " " << n3 << " " << transa << " " << transb << "\n"; - is_error = is_error || test_lrmat_product, GeneratorTestComplex, sympartialACA>>(transa, transb, n1, n2, n3, epsilon, additional_compression_tolerance, additional_lrmat_sum_tolerances); - } - } - } - } - } - } - if (is_error) { - return 1; - } - return 0; -} diff --git a/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp b/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp index 9380dd55..9b7fb8a5 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp @@ -65,7 +65,11 @@ bool test_hmatrix_cholesky(char UPLO, int n1, int n2, htool::underlying_type Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; test_case.operator_A->copy_submatrix(no_A, ni_A, test_case.root_cluster_A_output->get_offset(), test_case.root_cluster_A_input->get_offset(), A_dense.data()); generate_random_matrix(X_dense); - add_symmetric_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + if (is_complex()) { + add_hermitian_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + } else { + add_symmetric_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + } // Cholesky factorization matrix_test = B_dense; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp b/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp index e6cb8119..a899937b 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp @@ -16,7 +16,7 @@ using namespace std; using namespace htool; template -bool test_hmatrix_lrmat_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type) { +bool test_hmatrix_lrmat_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type additional_lrmat_sum_tolerance) { int rankWorld; MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); @@ -125,7 +125,7 @@ bool test_hmatrix_lrmat_product(const TestCaseProduct &tes dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); lrmat_test.copy_to_dense(dense_lrmat_test.data()); error = normFrob(matrix_result_w_lrmat_sum - dense_lrmat_test) / normFrob(matrix_result_w_lrmat_sum); - is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance)); + is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * (1 + additional_lrmat_sum_tolerance)); cout << "> Errors on a hmatrix lrmat product to lrmat with lrmat sum: " << error << endl; hmatrix_test = C; @@ -140,7 +140,7 @@ bool test_hmatrix_lrmat_product(const TestCaseProduct &tes add_hmatrix_lrmat_product(transa, transb, alpha, root_hmatrix, B_auto_approximation, beta, hmatrix_test); copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); error = normFrob(matrix_result_w_matrix_sum - densified_hmatrix_test) / normFrob(matrix_result_w_matrix_sum); - is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance)); + is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * (1 + additional_lrmat_sum_tolerance)); cout << "> Errors on a hmatrix lrmat product to hmatrix with lrmat sum: " << error << endl; cout << "> is_error: " << is_error << endl; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp b/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp index 9070fd18..6c25d46d 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp @@ -15,7 +15,7 @@ using namespace std; using namespace htool; template -bool test_hmatrix_matrix_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type margin) { +bool test_hmatrix_matrix_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type additional_lrmat_sum_tolerance) { int rankWorld; MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); @@ -109,6 +109,12 @@ bool test_hmatrix_matrix_product(const TestCaseProduct &te error = norm2(matrix_result_w_matrix_sum.get_col(0) - test_vec) / norm2(matrix_result_w_matrix_sum.get_col(0)); is_error = is_error || !(error < epsilon); cout << "> Errors on a hmatrix vector product: " << error << endl; + + test_vec = C_vec; + openmp_add_hmatrix_vector_product(transa, alpha, root_hmatrix, B_vec.data(), beta, test_vec.data()); + error = norm2(matrix_result_w_matrix_sum.get_col(0) - test_vec) / norm2(matrix_result_w_matrix_sum.get_col(0)); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a openmp hmatrix vector product: " << error << endl; } matrix_test = C_dense; @@ -145,7 +151,7 @@ bool test_hmatrix_matrix_product(const TestCaseProduct &te dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); lrmat_test.copy_to_dense(dense_lrmat_test.data()); error = normFrob(matrix_result_w_lrmat_sum - dense_lrmat_test) / normFrob(matrix_result_w_lrmat_sum); - is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * margin); + is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * (1 + additional_lrmat_sum_tolerance)); cout << "> Errors on a hmatrix matrix product to lrmat with sum: " << error << endl; // matrix_test = C_dense; @@ -169,12 +175,12 @@ bool test_symmetric_hmatrix_matrix_product(const TestCaseSymmetricProduct> *root_cluster_A_output, *root_cluster_A_input, *root_cluster_B_output, *root_cluster_B_input, *root_cluster_C_output, *root_cluster_C_input; - root_cluster_A_output = test_case.root_cluster_A_output; - root_cluster_A_input = test_case.root_cluster_A_input; - root_cluster_B_output = test_case.root_cluster_B_output; - root_cluster_B_input = test_case.root_cluster_B_input; - root_cluster_C_output = test_case.root_cluster_C_output; - root_cluster_C_input = test_case.root_cluster_C_input; + root_cluster_A_output = &test_case.root_cluster_A_output->get_cluster_on_partition(rankWorld); + root_cluster_A_input = &test_case.root_cluster_A_input->get_cluster_on_partition(rankWorld); + root_cluster_B_output = &test_case.root_cluster_B_output->get_cluster_on_partition(rankWorld); + root_cluster_B_input = &test_case.root_cluster_B_input->get_cluster_on_partition(rankWorld); + root_cluster_C_output = &test_case.root_cluster_C_output->get_cluster_on_partition(rankWorld); + root_cluster_C_input = &test_case.root_cluster_C_input->get_cluster_on_partition(rankWorld); HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'S', UPLO, -1, -1, rankWorld); hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); @@ -220,10 +226,15 @@ bool test_symmetric_hmatrix_matrix_product(const TestCaseSymmetricProduct matrix_result_w_matrix_sum(C_dense), transposed_matrix_result_w_matrix_sum(C_dense.nb_cols(), C_dense.nb_rows()), matrix_result_wo_sum(C_dense), matrix_result_w_lrmat_sum(C_dense); C_auto_approximation.copy_to_dense(matrix_result_w_lrmat_sum.data()); + // if (is_complex()) { add_symmetric_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, beta, matrix_result_w_matrix_sum); transpose(matrix_result_w_matrix_sum, transposed_matrix_result_w_matrix_sum); - add_symmetric_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, T(0), matrix_result_wo_sum); - add_symmetric_matrix_matrix_product(side, UPLO, alpha, HA_dense, B_dense, beta, matrix_result_w_lrmat_sum); + // add_symmetric_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, T(0), matrix_result_wo_sum); + // add_symmetric_matrix_matrix_product(side, UPLO, alpha, HA_dense, B_dense, beta, matrix_result_w_lrmat_sum); + // } else { + // add_hermitian_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, beta, matrix_result_w_matrix_sum); + // transpose(matrix_result_w_matrix_sum, transposed_matrix_result_w_matrix_sum); + // } Matrix scaled_matrix_result_w_matrix_sum(matrix_result_w_matrix_sum); scale(scaling_coefficient, scaled_matrix_result_w_matrix_sum); @@ -237,42 +248,145 @@ bool test_symmetric_hmatrix_matrix_product(const TestCaseSymmetricProduct Errors on a symmetric hmatrix vector product: " << error << endl; // } - // matrix_test = C_dense; - // add_symmetric_hmatrix_matrix_product(side, UPLO, alpha, root_hmatrix, B_dense, beta, matrix_test); - // error = normFrob(matrix_result_w_matrix_sum - matrix_test) / normFrob(matrix_result_w_matrix_sum); - // is_error = is_error || !(error < epsilon); - // cout << "> Errors on a symmetric hmatrix matrix product: " << error << endl; - if (side == 'L') { - // matrix_test = transposed_C_dense; - // openmp_add_hmatrix_matrix_product_row_major(transa, transb, alpha, root_hmatrix, transposed_B_dense.data(), beta, matrix_test.data(), C_dense.nb_cols()); - // error = normFrob(transposed_matrix_result_w_matrix_sum - matrix_test) / normFrob(transposed_matrix_result_w_matrix_sum); - // is_error = is_error || !(error < epsilon); - // cout << "> Errors on a openmp symmetric hmatrix matrix product with row major input: " << error << endl; + matrix_test = C_dense; + add_hmatrix_matrix_product('N', 'N', alpha, root_hmatrix, B_dense, beta, matrix_test); + error = normFrob(matrix_result_w_matrix_sum - matrix_test) / normFrob(matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a symmetric hmatrix matrix product: " << error << endl; matrix_test = transposed_C_dense; - sequential_add_symmetric_hmatrix_matrix_product_row_major(side, UPLO, alpha, root_hmatrix, transposed_B_dense.data(), beta, matrix_test.data(), C_dense.nb_cols()); + sequential_add_hmatrix_matrix_product_row_major('N', 'N', alpha, root_hmatrix, transposed_B_dense.data(), beta, matrix_test.data(), C_dense.nb_cols()); error = normFrob(transposed_matrix_result_w_matrix_sum - matrix_test) / normFrob(transposed_matrix_result_w_matrix_sum); is_error = is_error || !(error < epsilon); cout << "> Errors on a sequential symmetric hmatrix matrix product with row major input: " << error << endl; + + matrix_test = transposed_C_dense; + openmp_add_hmatrix_matrix_product_row_major('N', 'N', alpha, root_hmatrix, transposed_B_dense.data(), beta, matrix_test.data(), C_dense.nb_cols()); + error = normFrob(transposed_matrix_result_w_matrix_sum - matrix_test) / normFrob(transposed_matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a openmp symmetric hmatrix matrix product with row major input: " << error << endl; + } else { + matrix_test = C_dense; + add_matrix_hmatrix_product('N', 'N', alpha, B_dense, root_hmatrix, beta, matrix_test); + error = normFrob(matrix_result_w_matrix_sum - matrix_test) / normFrob(matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a matrix symmetric hmatrix product: " << error << endl; } - // // Product - // lrmat_test = C_auto_approximation; - // add_symmetric_hmatrix_matrix_product(side, UPLO, alpha, root_hmatrix, B_dense, T(0), lrmat_test); - // dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); - // lrmat_test.copy_to_dense(dense_lrmat_test.data()); - // error = normFrob(matrix_result_wo_sum - dense_lrmat_test) / normFrob(matrix_result_wo_sum); - // is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance)); - // cout << "> Errors on a hmatrix matrix product to lrmat without sum: " << error << endl; - - // lrmat_test = C_auto_approximation; - // add_symmetric_hmatrix_matrix_product(transa, transb, alpha, root_hmatrix, B_dense, beta, lrmat_test); - // dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); - // lrmat_test.copy_to_dense(dense_lrmat_test.data()); - // error = normFrob(matrix_result_w_lrmat_sum - dense_lrmat_test) / normFrob(matrix_result_w_lrmat_sum); - // is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * margin); - // cout << "> Errors on a symmetric_hmatrix matrix product to lrmat with sum: " << error << endl; + return is_error; +} + +template +bool test_hermitian_hmatrix_matrix_product(const TestCaseSymmetricProduct &test_case, htool::underlying_type epsilon, htool::underlying_type) { + + int rankWorld; + MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); + bool is_error = false; + double eta = 10; + char side = test_case.side; + char UPLO = test_case.UPLO; + + const Cluster> *root_cluster_A_output, *root_cluster_A_input, *root_cluster_B_output, *root_cluster_B_input, *root_cluster_C_output, *root_cluster_C_input; + root_cluster_A_output = &test_case.root_cluster_A_output->get_cluster_on_partition(rankWorld); + root_cluster_A_input = &test_case.root_cluster_A_input->get_cluster_on_partition(rankWorld); + root_cluster_B_output = &test_case.root_cluster_B_output->get_cluster_on_partition(rankWorld); + root_cluster_B_input = &test_case.root_cluster_B_input->get_cluster_on_partition(rankWorld); + root_cluster_C_output = &test_case.root_cluster_C_output->get_cluster_on_partition(rankWorld); + root_cluster_C_input = &test_case.root_cluster_C_input->get_cluster_on_partition(rankWorld); + + HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'H', UPLO, -1, -1, rankWorld); + hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); + + // build + HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A); + + // Dense matrix + int ni_A = root_cluster_A_input->get_size(); + int no_A = root_cluster_A_output->get_size(); + int ni_B = root_cluster_B_input->get_size(); + int no_B = root_cluster_B_output->get_size(); + int ni_C = root_cluster_C_input->get_size(); + int no_C = root_cluster_C_output->get_size(); + + Matrix A_dense(no_A, ni_A), B_dense(no_B, ni_B), C_dense(no_C, ni_C); + test_case.operator_A->copy_submatrix(no_A, ni_A, root_cluster_A_output->get_offset(), root_cluster_A_input->get_offset(), A_dense.data()); + test_case.operator_B->copy_submatrix(no_B, ni_B, root_cluster_B_output->get_offset(), root_cluster_B_input->get_offset(), B_dense.data()); + test_case.operator_C->copy_submatrix(no_C, ni_C, root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_dense.data()); + Matrix matrix_test, dense_lrmat_test, transposed_B_dense(B_dense.nb_cols(), B_dense.nb_rows()), transposed_C_dense(C_dense.nb_cols(), C_dense.nb_rows()); + transpose(B_dense, transposed_B_dense); + transpose(C_dense, transposed_C_dense); + + Matrix HA_dense(root_hmatrix.get_target_cluster().get_size(), root_hmatrix.get_source_cluster().get_size()); + copy_to_dense(root_hmatrix, HA_dense.data()); + + // // lrmat + // SVD compressor; + // htool::underlying_type lrmat_tolerance = 0.0001; + // LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance), lrmat_test(lrmat_tolerance); + + // Random Input matrix + std::vector B_vec, C_vec, test_vec; + B_vec = B_dense.get_col(0); + C_vec = C_dense.get_col(0); + T alpha(0), beta(0), scaling_coefficient; + htool::underlying_type error; + generate_random_scalar(alpha); + generate_random_scalar(beta); + generate_random_scalar(scaling_coefficient); + + // References + Matrix matrix_result_w_matrix_sum(C_dense), transposed_matrix_result_w_matrix_sum(C_dense.nb_cols(), C_dense.nb_rows()), matrix_result_wo_sum(C_dense), matrix_result_w_lrmat_sum(C_dense); + // C_auto_approximation.copy_to_dense(matrix_result_w_lrmat_sum.data()); + + // if (is_complex()) { + add_hermitian_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, beta, matrix_result_w_matrix_sum); + transpose(matrix_result_w_matrix_sum, transposed_matrix_result_w_matrix_sum); + // add_symmetric_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, T(0), matrix_result_wo_sum); + // add_symmetric_matrix_matrix_product(side, UPLO, alpha, HA_dense, B_dense, beta, matrix_result_w_lrmat_sum); + // } else { + // add_hermitian_matrix_matrix_product(side, UPLO, alpha, A_dense, B_dense, beta, matrix_result_w_matrix_sum); + // transpose(matrix_result_w_matrix_sum, transposed_matrix_result_w_matrix_sum); + // } + + Matrix scaled_matrix_result_w_matrix_sum(matrix_result_w_matrix_sum); + scale(scaling_coefficient, scaled_matrix_result_w_matrix_sum); + + // Product + // if ((side == 'L' && ni_B == 1) || (side == 'R' && no_B == 1)) { + // test_vec = C_vec; + // add_symmetric_hmatrix_vector_product(side, alpha, root_hmatrix, B_vec.data(), beta, test_vec.data()); + // error = norm2(matrix_result_w_matrix_sum.get_col(0) - test_vec) / norm2(matrix_result_w_matrix_sum.get_col(0)); + // is_error = is_error || !(error < epsilon); + // cout << "> Errors on a symmetric hmatrix vector product: " << error << endl; + // } + + if (side == 'L') { + matrix_test = C_dense; + add_hmatrix_matrix_product('N', 'N', alpha, root_hmatrix, B_dense, beta, matrix_test); + error = normFrob(matrix_result_w_matrix_sum - matrix_test) / normFrob(matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a hermitian hmatrix matrix product: " << error << endl; + + matrix_test = transposed_C_dense; + sequential_add_hmatrix_matrix_product_row_major('N', 'N', alpha, root_hmatrix, transposed_B_dense.data(), beta, matrix_test.data(), C_dense.nb_cols()); + error = normFrob(transposed_matrix_result_w_matrix_sum - matrix_test) / normFrob(transposed_matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a sequential hermitian hmatrix matrix product with row major input: " << error << endl; + + matrix_test = transposed_C_dense; + openmp_add_hmatrix_matrix_product_row_major('N', 'N', alpha, root_hmatrix, transposed_B_dense.data(), beta, matrix_test.data(), C_dense.nb_cols()); + error = normFrob(transposed_matrix_result_w_matrix_sum - matrix_test) / normFrob(transposed_matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a openmp hermitian hmatrix matrix product with row major input: " << error << endl; + } else { + matrix_test = C_dense; + // std::cout << "??? " << root_hmatrix.get_symmetry_for_leaves() << "\n"; + add_matrix_hmatrix_product('N', 'N', alpha, B_dense, root_hmatrix, beta, matrix_test); + error = normFrob(matrix_result_w_matrix_sum - matrix_test) / normFrob(matrix_result_w_matrix_sum); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a matrix hermitian hmatrix product: " << error << endl; + } return is_error; } diff --git a/tests/functional_tests/hmatrix/test_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_hmatrix_product.hpp index b720ace0..2120d76f 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_product.hpp @@ -18,7 +18,7 @@ using namespace std; using namespace htool; template -bool test_hmatrix_product(char transa, char transb, int n1, int n2, int n3, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type margin) { +bool test_hmatrix_product(char transa, char transb, int n1, int n2, int n3, bool use_local_cluster, htool::underlying_type epsilon, std::array, 5> additional_lrmat_sum_tolerances) { // Get the number of processes int sizeWorld; MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld); @@ -30,11 +30,11 @@ bool test_hmatrix_product(char transa, char transb, int n1, int n2, int n3, bool bool is_error = false; TestCaseProduct test_case(transa, transb, n1, n2, n3, 1, 2, sizeWorld); - is_error = test_hmatrix_matrix_product(test_case, use_local_cluster, epsilon, margin); - is_error = test_matrix_hmatrix_product(test_case, use_local_cluster, epsilon, margin); - is_error = test_hmatrix_lrmat_product(test_case, use_local_cluster, epsilon, margin); - is_error = test_lrmat_hmatrix_product(test_case, use_local_cluster, epsilon, margin); - is_error = test_hmatrix_hmatrix_product(test_case, use_local_cluster, epsilon, margin); + is_error = is_error || test_hmatrix_matrix_product(test_case, use_local_cluster, epsilon, additional_lrmat_sum_tolerances[0]); + is_error = is_error || test_matrix_hmatrix_product(test_case, use_local_cluster, epsilon, additional_lrmat_sum_tolerances[1]); + is_error = is_error || test_hmatrix_lrmat_product(test_case, use_local_cluster, epsilon, additional_lrmat_sum_tolerances[2]); + is_error = is_error || test_lrmat_hmatrix_product(test_case, use_local_cluster, epsilon, additional_lrmat_sum_tolerances[3]); + is_error = is_error || test_hmatrix_hmatrix_product(test_case, use_local_cluster, epsilon, additional_lrmat_sum_tolerances[4]); return is_error; } @@ -59,3 +59,25 @@ bool test_symmetric_hmatrix_product(int n1, int n2, char side, char UPLO, htool: // } return is_error; } + +template +bool test_hermitian_hmatrix_product(int n1, int n2, char side, char UPLO, htool::underlying_type epsilon, htool::underlying_type margin) { + // Get the number of processes + int sizeWorld; + MPI_Comm_size(MPI_COMM_WORLD, &sizeWorld); + + // Get the rankWorld of the process + int rankWorld; + MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); + + bool is_error = false; + TestCaseSymmetricProduct test_case(n1, n2, 2, side, 'H', UPLO, sizeWorld); + + is_error = test_hermitian_hmatrix_matrix_product(test_case, epsilon, margin); + // is_error = test_symmetric_hmatrix_lrmat_product(test_case, use_local_cluster, epsilon); + // is_error = test_symmetric_hmatrix_hmatrix_product(test_case, use_local_cluster, epsilon, margin); + // if (!(side == 'L' && Symmetry != 'N')) { + // is_error = test_lrmat_hmatrix_product(test_case, use_local_cluster, epsilon); + // } + return is_error; +} diff --git a/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp index bffe07cb..487c3d5b 100644 --- a/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp @@ -13,7 +13,7 @@ using namespace std; using namespace htool; template -bool test_lrmat_hmatrix_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type) { +bool test_lrmat_hmatrix_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type additional_lrmat_sum_tolerance) { int rankWorld; MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); @@ -115,7 +115,7 @@ bool test_lrmat_hmatrix_product(const TestCaseProduct &tes dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); lrmat_test.copy_to_dense(dense_lrmat_test.data()); error = normFrob(matrix_result_w_lrmat_sum - dense_lrmat_test) / normFrob(matrix_result_w_lrmat_sum); - is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance)); + is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * (1 + additional_lrmat_sum_tolerance)); cout << "> Errors on a lrmat hmatrix product to lrmat with lrmat sum: " << error << endl; hmatrix_test = C; @@ -129,7 +129,7 @@ bool test_lrmat_hmatrix_product(const TestCaseProduct &tes add_lrmat_hmatrix_product(transa, transb, alpha, A_auto_approximation, root_hmatrix, beta, hmatrix_test); copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); error = normFrob(matrix_result_w_matrix_sum - densified_hmatrix_test) / normFrob(matrix_result_w_matrix_sum); - is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance)); + is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * (1 + additional_lrmat_sum_tolerance)); cout << "> Errors on a lrmat hmatrix product to hmatrix with lrmat sum: " << error << endl; cout << "> is_error: " << is_error << endl; diff --git a/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp index 36764ea8..980e90e2 100644 --- a/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp @@ -14,7 +14,7 @@ using namespace std; using namespace htool; template -bool test_matrix_hmatrix_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type margin) { +bool test_matrix_hmatrix_product(const TestCaseProduct &test_case, bool use_local_cluster, htool::underlying_type epsilon, htool::underlying_type additional_lrmat_sum_tolerance) { int rankWorld; MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); @@ -113,7 +113,7 @@ bool test_matrix_hmatrix_product(const TestCaseProduct &te dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); lrmat_test.copy_to_dense(dense_lrmat_test.data()); error = normFrob(matrix_result_w_lrmat_sum - dense_lrmat_test) / normFrob(matrix_result_w_lrmat_sum); - is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * margin); + is_error = is_error || !(error < std::max(epsilon, lrmat_tolerance) * (1 + additional_lrmat_sum_tolerance)); cout << "> Errors on a matrix hmatrix product to lrmat with sum: " << error << endl; return is_error;