From 93e9b9b2b3496cd68d8115b16823ad7a6f6a5510 Mon Sep 17 00:00:00 2001 From: Edgar Solomonik Date: Wed, 11 May 2016 13:43:38 +0200 Subject: [PATCH] Made intermediates formed in contraction expressions be smaller when possible, incremented version to 1.34 --- include/ctf.hpp | 2 +- src/interface/fun_term.cxx | 4 +- src/interface/fun_term.h | 4 +- src/interface/idx_tensor.cxx | 6 +-- src/interface/idx_tensor.h | 4 +- src/interface/schedule.cxx | 44 ++++++++-------- src/interface/schedule.h | 4 +- src/interface/term.cxx | 97 ++++++++++++++++++++++++++++++++++-- src/interface/term.h | 19 ++----- 9 files changed, 131 insertions(+), 53 deletions(-) diff --git a/include/ctf.hpp b/include/ctf.hpp index a7365e69..22822ba1 100644 --- a/include/ctf.hpp +++ b/include/ctf.hpp @@ -12,7 +12,7 @@ #include #include -#define CTF_VERSION 133 +#define CTF_VERSION 134 #include "../src/interface/tensor.h" #include "../src/interface/idx_tensor.h" diff --git a/src/interface/fun_term.cxx b/src/interface/fun_term.cxx index 5c531fb0..3f298d0c 100644 --- a/src/interface/fun_term.cxx +++ b/src/interface/fun_term.cxx @@ -55,7 +55,7 @@ namespace CTF_int { } - void Unifun_Term::get_inputs(std::set* inputs_set) const { + void Unifun_Term::get_inputs(std::set* inputs_set) const { A->get_inputs(inputs_set); } @@ -131,7 +131,7 @@ namespace CTF_int { } - void Bifun_Term::get_inputs(std::set* inputs_set) const { + void Bifun_Term::get_inputs(std::set* inputs_set) const { A->get_inputs(inputs_set); B->get_inputs(inputs_set); } diff --git a/src/interface/fun_term.h b/src/interface/fun_term.h index 3379e0fe..7f0e14cd 100644 --- a/src/interface/fun_term.h +++ b/src/interface/fun_term.h @@ -32,7 +32,7 @@ namespace CTF_int { double estimate_time(CTF::Idx_Tensor output) const; - void get_inputs(std::set* inputs_set) const; + void get_inputs(std::set* inputs_set) const; CTF::World * where_am_i() const; }; @@ -62,7 +62,7 @@ namespace CTF_int { double estimate_time(CTF::Idx_Tensor output) const; - void get_inputs(std::set* inputs_set) const; + void get_inputs(std::set* inputs_set) const; CTF::World * where_am_i() const; }; diff --git a/src/interface/idx_tensor.cxx b/src/interface/idx_tensor.cxx index f55b02e8..afae327a 100644 --- a/src/interface/idx_tensor.cxx +++ b/src/interface/idx_tensor.cxx @@ -299,10 +299,8 @@ namespace CTF { return *this; } - void Idx_Tensor::get_inputs(std::set* inputs_set) const { - if (parent) { - inputs_set->insert(parent); - } + void Idx_Tensor::get_inputs(std::set* inputs_set) const { + inputs_set->insert((Idx_Tensor*)this); } /*template diff --git a/src/interface/idx_tensor.h b/src/interface/idx_tensor.h index f24ba8fc..d7fbb22f 100644 --- a/src/interface/idx_tensor.h +++ b/src/interface/idx_tensor.h @@ -86,7 +86,7 @@ namespace CTF { /** * \brief appends the tensors this depends on to the input set */ - void get_inputs(std::set< CTF_int::tensor*, CTF_int::tensor_tid_less >* inputs_set) const; + void get_inputs(std::set* inputs_set) const; /** * \brief A = B, compute any operations on operand B and set @@ -319,6 +319,4 @@ namespace CTF { * @} */ } - - #endif diff --git a/src/interface/schedule.cxx b/src/interface/schedule.cxx index 145d4675..38bbc1fc 100644 --- a/src/interface/schedule.cxx +++ b/src/interface/schedule.cxx @@ -37,11 +37,11 @@ namespace CTF { World * world; std::vector ops; // operations to execute - std::set local_tensors; // all local tensors used + std::set local_tensors; // all local tensors used std::map remap; // mapping from global tensor -> local tensor - std::set global_tensors; // all referenced tensors stored as global tensors - std::set output_tensors; // tensors to be written back out, stored as global tensors + std::set global_tensors; // all referenced tensors stored as global tensors + std::set output_tensors; // tensors to be written back out, stored as global tensors }; ScheduleTimer Schedule::partition_and_execute() { @@ -156,21 +156,21 @@ namespace CTF { // Create and communicate tensors to subworlds schedule_timer.comm_down_time = MPI_Wtime(); for (comm_op_iter=comm_ops.begin(); comm_op_iter!=comm_ops.end(); comm_op_iter++) { - typename std::set::iterator global_tensor_iter; + typename std::set::iterator global_tensor_iter; for (global_tensor_iter=comm_op_iter->global_tensors.begin(); global_tensor_iter!=comm_op_iter->global_tensors.end(); global_tensor_iter++) { - tensor* local_clone; + Idx_Tensor* local_clone; if (comm_op_iter->world != NULL) { - local_clone = new tensor(*(*global_tensor_iter));//, *comm_op_iter->world); + local_clone = new Idx_Tensor(*(*global_tensor_iter));//, *comm_op_iter->world); } else { local_clone = NULL; } comm_op_iter->local_tensors.insert(local_clone); - comm_op_iter->remap[*global_tensor_iter] = local_clone; - (*global_tensor_iter)->add_to_subworld(local_clone, (*global_tensor_iter)->sr->mulid(), (*global_tensor_iter)->sr->addid()); + comm_op_iter->remap[(*global_tensor_iter)->parent] = local_clone->parent; + (*global_tensor_iter)->parent->add_to_subworld(local_clone->parent, (*global_tensor_iter)->sr->mulid(), (*global_tensor_iter)->sr->addid()); } - typename std::set::iterator output_tensor_iter; + typename std::set::iterator output_tensor_iter; for (output_tensor_iter=comm_op_iter->output_tensors.begin(); output_tensor_iter!=comm_op_iter->output_tensors.end(); output_tensor_iter++) { - assert(comm_op_iter->remap.find(*output_tensor_iter) != comm_op_iter->remap.end()); + assert(comm_op_iter->remap.find((*output_tensor_iter)->parent) != comm_op_iter->remap.end()); } } schedule_timer.comm_down_time = MPI_Wtime() - schedule_timer.comm_down_time; @@ -201,16 +201,16 @@ namespace CTF { // Communicate results back into global schedule_timer.comm_up_time = MPI_Wtime(); for (comm_op_iter=comm_ops.begin(); comm_op_iter!=comm_ops.end(); comm_op_iter++) { - typename std::set::iterator output_tensor_iter; + typename std::set::iterator output_tensor_iter; for (output_tensor_iter=comm_op_iter->output_tensors.begin(); output_tensor_iter!=comm_op_iter->output_tensors.end(); output_tensor_iter++) { - (*output_tensor_iter)->add_from_subworld(comm_op_iter->remap[*output_tensor_iter], (*output_tensor_iter)->sr->mulid(), (*output_tensor_iter)->sr->addid()); + (*output_tensor_iter)->parent->add_from_subworld(comm_op_iter->remap[(*output_tensor_iter)->parent], (*output_tensor_iter)->sr->mulid(), (*output_tensor_iter)->sr->addid()); } } schedule_timer.comm_up_time = MPI_Wtime() - schedule_timer.comm_up_time; // Clean up local tensors & world if ((int64_t)comm_ops.size() > my_color) { - typename std::set::iterator local_tensor_iter; + typename std::set::iterator local_tensor_iter; for (local_tensor_iter=comm_ops[my_color].local_tensors.begin(); local_tensor_iter!=comm_ops[my_color].local_tensors.end(); local_tensor_iter++) { delete *local_tensor_iter; } @@ -279,17 +279,17 @@ namespace CTF { void Schedule::add_operation_typed(TensorOperation* op) { steps_original.push_back(op); - std::set op_lhs_set; + std::set op_lhs_set; op->get_outputs(&op_lhs_set); assert(op_lhs_set.size() == 1); // limited case to make this a bit easier - tensor* op_lhs = *op_lhs_set.begin(); + tensor* op_lhs = (*op_lhs_set.begin())->parent; - std::set op_deps; + std::set op_deps; op->get_inputs(&op_deps); - typename std::set::iterator deps_iter; + typename std::set::iterator deps_iter; for (deps_iter = op_deps.begin(); deps_iter != op_deps.end(); deps_iter++) { - tensor* dep = *deps_iter; + tensor* dep = (*deps_iter)->parent; typename std::map::iterator dep_loc = latest_write.find(dep); TensorOperation* dep_op; if (dep_loc != latest_write.end()) { @@ -363,13 +363,13 @@ namespace CTF { } } - void TensorOperation::get_outputs(std::set* outputs_set) const { + void TensorOperation::get_outputs(std::set* outputs_set) const { assert(lhs->parent); assert(outputs_set != NULL); - outputs_set->insert(lhs->parent); + outputs_set->insert(lhs); } - void TensorOperation::get_inputs(std::set* inputs_set) const { + void TensorOperation::get_inputs(std::set* inputs_set) const { rhs->get_inputs(inputs_set); switch (op) { @@ -379,7 +379,7 @@ namespace CTF { case TENSOR_OP_SUBTRACT: case TENSOR_OP_MULTIPLY: assert(lhs->parent != NULL); - inputs_set->insert(lhs->parent); + inputs_set->insert(lhs); break; default: std::cerr << "TensorOperation::get_inputs(): unexpected op: " << op << std::endl; diff --git a/src/interface/schedule.h b/src/interface/schedule.h index 2fcafef5..3e897c56 100644 --- a/src/interface/schedule.h +++ b/src/interface/schedule.h @@ -48,13 +48,13 @@ namespace CTF { /** * \brief appends the tensors this writes to to the input set */ - void get_outputs(std::set* outputs_set) const; + void get_outputs(std::set* outputs_set) const; /** * \brief appends the tensors this depends on (reads from, including the output * if a previous value is required) to the input set */ - void get_inputs(std::set* inputs_set) const; + void get_inputs(std::set* inputs_set) const; /** * \brief runs this operation, but does NOT handle dependency scheduling diff --git a/src/interface/term.cxx b/src/interface/term.cxx index e3800fbc..ea148314 100644 --- a/src/interface/term.cxx +++ b/src/interface/term.cxx @@ -38,6 +38,7 @@ namespace CTF_int { } } } + idx_C = (char*)alloc(sizeof(char)*order_C); sym_C = (int*)alloc(sizeof(int)*order_C); @@ -92,6 +93,61 @@ namespace CTF_int { free(len_C); free(idx_C); return out; + + } + + Idx_Tensor * get_full_intm(Idx_Tensor& A, + Idx_Tensor& B, + int num_out_inds, + char const * out_inds){ + int * len_C, * sym_C; + char * idx_C; + int order_C, i, j; + + idx_C = (char*)alloc(sizeof(char)*num_out_inds); + sym_C = (int*)alloc(sizeof(int)*num_out_inds); + len_C = (int*)alloc(sizeof(int)*num_out_inds); + order_C = 0; + for (j=0; jorder; i++){ + if (A.idx_map[i] == out_inds[j]){ + found = true; + len = A.parent->lens[i]; + if (sym_prev != -1) sym_prev = NS; + else if (i>0 && order_C>0 && A.idx_map[i-1] == idx_C[order_C-1]) sym_prev = A.parent->sym[i-1]; + else sym_prev = NS; + } + } + if (!found){ + for (i=0; iorder; i++){ + if (B.idx_map[i] == out_inds[j]){ + found = true; + len = B.parent->lens[i]; + if (sym_prev != NS && i>0 && order_C>0 && B.idx_map[i-1] == idx_C[order_C-1]) sym_prev = B.parent->sym[i-1]; + else sym_prev = NS; + + } + } + } + if (found){ + idx_C[order_C] = out_inds[j]; + len_C[order_C] = len; + if (sym_prev > 0) + sym_C[order_C-1] = sym_prev; + sym_C[order_C] = NS; + order_C++; + } + } + tensor * tsr_C = new tensor(A.parent->sr, order_C, len_C, sym_C, A.parent->wrld, 1); + Idx_Tensor * out = new Idx_Tensor(tsr_C, idx_C); + out->is_intm = 1; + free(sym_C); + free(len_C); + free(idx_C); + return out; } @@ -366,7 +422,7 @@ namespace CTF_int { } - void Sum_Term::get_inputs(std::set* inputs_set) const { + void Sum_Term::get_inputs(std::set* inputs_set) const { for (int i=0; i<(int)operands.size(); i++){ operands[i]->get_inputs(inputs_set); } @@ -455,7 +511,22 @@ namespace CTF_int { sr->safemul(op_A.scale, op_B.scale, op_A.scale); tmp_ops.push_back(op_A.clone()); } else { - Idx_Tensor * intm = get_full_intm(op_A, op_B); + std::set uniq_inds; + for (int k=0; korder; k++){ + uniq_inds.insert(output.idx_map[k]); + } + std::set inputs; + for (int j=0; j<(int)tmp_ops.size(); j++){ + tmp_ops[j]->get_inputs(&inputs); + } + for (std::set::iterator j=inputs.begin(); j!=inputs.end(); j++){ + for (int k=0; k<(*j)->parent->order; k++){ + uniq_inds.insert((*j)->idx_map[k]); + } + } + std::vector arr(uniq_inds.begin(), uniq_inds.end()); + + Idx_Tensor * intm = get_full_intm(op_A, op_B, uniq_inds.size(), &(arr[0])); sr->safemul(tscale, op_A.scale, tscale); sr->safemul(tscale, op_B.scale, tscale); contraction c(op_A.parent, op_A.idx_map, @@ -527,6 +598,7 @@ namespace CTF_int { sr->safemul(op_B.scale, op_A.scale, op_A.scale); tmp_ops.push_back(op_A.clone()); } else { + printf("HERE2\n"); Idx_Tensor * intm = get_full_intm(op_A, op_B); sr->safemul(tscale, op_A.scale, tscale); sr->safemul(tscale, op_B.scale, tscale); @@ -644,7 +716,7 @@ namespace CTF_int { - void Contract_Term::get_inputs(std::set* inputs_set) const { + void Contract_Term::get_inputs(std::set* inputs_set) const { for (int i=0; i<(int)operands.size(); i++){ operands[i]->get_inputs(inputs_set); } @@ -678,3 +750,22 @@ namespace CTF_int { } + +namespace CTF_int { + bool tensor_name_less::operator()(CTF::Idx_Tensor* A, CTF::Idx_Tensor* B) { + int d = strcmp(A->parent->name, B->parent->name); + if (d>0) return d; else return 1; + /*if (A == NULL && B != NULL) { + return true; + } else if (A == NULL || B == NULL) { + return false; + } + assert(0);//FIXME + //return A->tid < B->tid; + return -1;*/ + } +} + + + + diff --git a/src/interface/term.h b/src/interface/term.h index bb488a1b..49fd9be6 100644 --- a/src/interface/term.h +++ b/src/interface/term.h @@ -22,17 +22,8 @@ namespace CTF_int { * \brief comparison function for sets of tensor pointers * This ensures the set iteration order is consistent across nodes */ - struct tensor_tid_less { - bool operator()(tensor* A, tensor* B) { - if (A == NULL && B != NULL) { - return true; - } else if (A == NULL || B == NULL) { - return false; - } - assert(0);//FIXME - //return A->tid < B->tid; - return -1; - } + struct tensor_name_less { + bool operator()(CTF::Idx_Tensor* A, CTF::Idx_Tensor* B); }; @@ -84,7 +75,7 @@ namespace CTF_int { /** * \brief appends the tensors this depends on to the input set */ - virtual void get_inputs(std::set* inputs_set) const = 0; + virtual void get_inputs(std::set* inputs_set) const = 0; /** * \brief constructs a new term which multiplies by tensor A @@ -217,7 +208,7 @@ namespace CTF_int { /** * \brief appends the tensors this depends on to the input set */ - void get_inputs(std::set* inputs_set) const; + void get_inputs(std::set* inputs_set) const; /** * \brief constructs a new term by addition of two terms @@ -278,7 +269,7 @@ namespace CTF_int { /** * \brief appends the tensors this depends on to the input set */ - void get_inputs(std::set* inputs_set) const; + void get_inputs(std::set* inputs_set) const; /** * \brief evalues the expression to produce an intermediate with