From f61f555589c3f22ba60e305bb6b1510a4411da12 Mon Sep 17 00:00:00 2001 From: Edward Hutter Date: Tue, 19 Sep 2023 08:22:30 -0500 Subject: [PATCH 1/2] Initial commit for removing parameter from LinModel instantiations --- src/contraction/ctr_tsr.cxx | 16 +-- src/contraction/spctr_tsr.cxx | 46 ++++----- src/interface/common.cxx | 34 +++---- src/redistribution/dgtog_redist.cxx | 4 +- src/redistribution/nosym_transp.cxx | 10 +- src/shared/model.cxx | 152 ++++++++++++++-------------- src/shared/model.h | 8 +- src/shared/offload.cu | 12 +-- src/tensor/algstrct.cxx | 8 +- src/tensor/untyped_tensor.cxx | 6 +- 10 files changed, 151 insertions(+), 145 deletions(-) diff --git a/src/contraction/ctr_tsr.cxx b/src/contraction/ctr_tsr.cxx index 45cc994a..c78f5cac 100755 --- a/src/contraction/ctr_tsr.cxx +++ b/src/contraction/ctr_tsr.cxx @@ -435,7 +435,7 @@ namespace CTF_int { double seq_tsr_ctr::est_time_fp(int nlyr){ //return COST_MEMBW*(size_A+size_B+size_C)+COST_FLOP*flops; - double ps[] = {1.0, (double)est_membw(), est_fp()}; + double ps[] = {(double)est_membw(), est_fp()}; // printf("time estimate is %lf\n", seq_tsr_ctr_mdl.est_time(ps)); if (is_custom && !is_inner){ return seq_tsr_ctr_mdl_cst.est_time(ps); @@ -468,11 +468,11 @@ namespace CTF_int { // Check if we need to execute this function for the sake of training bool sr; if (is_custom && !is_inner){ - double tps[] = {0, 1.0, (double)est_membw(), est_fp()}; + double tps[] = {0, (double)est_membw(), est_fp()}; sr = seq_tsr_ctr_mdl_cst.should_observe(tps); } else if (is_inner){ ASSERT(is_custom || func == NULL); - double tps[] = {0.0, 1.0, (double)est_membw(), est_fp()}; + double tps[] = {0.0, (double)est_membw(), est_fp()}; if (is_custom){ if (inner_params.offload) sr = seq_tsr_ctr_mdl_cst_off.should_observe(tps); @@ -486,7 +486,7 @@ namespace CTF_int { } } else { - double tps[] = {0.0, 1.0, (double)est_membw(), est_fp()}; + double tps[] = {0.0, (double)est_membw(), est_fp()}; sr = seq_tsr_ctr_mdl_ref.should_observe(tps); 
} @@ -517,11 +517,11 @@ namespace CTF_int { idx_map_C, func); double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()}; + double tps[] = {exe_time, (double)est_membw(), est_fp()}; seq_tsr_ctr_mdl_cst.observe(tps); } else if (is_inner){ ASSERT(is_custom || func == NULL); -// double ps[] = {1.0, (double)est_membw(), est_fp()}; +// double ps[] = {(double)est_membw(), est_fp()}; // double est_time = seq_tsr_ctr_mdl_inr.est_time(ps); double st_time = MPI_Wtime(); sym_seq_ctr_inr(this->alpha, @@ -548,7 +548,7 @@ namespace CTF_int { func); double exe_time = MPI_Wtime()-st_time; // printf("exe_time = %E est_time = %E abs_err = %e rel_err = %lf\n", exe_time,est_time,fabs(exe_time-est_time),fabs(exe_time-est_time)/exe_time); - double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()}; + double tps[] = {exe_time, (double)est_membw(), est_fp()}; if (is_custom){ if (inner_params.offload) seq_tsr_ctr_mdl_cst_off.observe(tps); @@ -584,7 +584,7 @@ namespace CTF_int { sym_C, idx_map_C); double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()}; + double tps[] = {exe_time, (double)est_membw(), est_fp()}; seq_tsr_ctr_mdl_ref.observe(tps); } } diff --git a/src/contraction/spctr_tsr.cxx b/src/contraction/spctr_tsr.cxx index 078731cc..5ccde546 100644 --- a/src/contraction/spctr_tsr.cxx +++ b/src/contraction/spctr_tsr.cxx @@ -234,28 +234,28 @@ namespace CTF_int { return size_A+size_B+size_C; } - LinModel<3> seq_tsr_spctr_cst_off_k0(seq_tsr_spctr_cst_off_k0_init,"seq_tsr_spctr_cst_off_k0"); - LinModel<3> seq_tsr_spctr_cst_off_k1(seq_tsr_spctr_cst_off_k1_init,"seq_tsr_spctr_cst_off_k1"); - LinModel<3> seq_tsr_spctr_cst_off_k2(seq_tsr_spctr_cst_off_k2_init,"seq_tsr_spctr_cst_off_k2"); - LinModel<3> seq_tsr_spctr_off_k0(seq_tsr_spctr_off_k0_init,"seq_tsr_spctr_off_k0"); - LinModel<3> seq_tsr_spctr_off_k1(seq_tsr_spctr_off_k1_init,"seq_tsr_spctr_off_k1"); - LinModel<3> 
seq_tsr_spctr_off_k2(seq_tsr_spctr_off_k2_init,"seq_tsr_spctr_off_k2"); - LinModel<3> seq_tsr_spctr_cst_k0(seq_tsr_spctr_cst_k0_init,"seq_tsr_spctr_cst_k0"); - LinModel<3> seq_tsr_spctr_cst_k1(seq_tsr_spctr_cst_k1_init,"seq_tsr_spctr_cst_k1"); - LinModel<3> seq_tsr_spctr_cst_k2(seq_tsr_spctr_cst_k2_init,"seq_tsr_spctr_cst_k2"); - LinModel<3> seq_tsr_spctr_cst_k3(seq_tsr_spctr_cst_k3_init,"seq_tsr_spctr_cst_k3"); - LinModel<3> seq_tsr_spctr_cst_k4(seq_tsr_spctr_cst_k4_init,"seq_tsr_spctr_cst_k4"); - LinModel<3> seq_tsr_spctr_cst_k5(seq_tsr_spctr_cst_k5_init,"seq_tsr_spctr_cst_k5"); - LinModel<3> seq_tsr_spctr_k0(seq_tsr_spctr_k0_init,"seq_tsr_spctr_k0"); - LinModel<3> seq_tsr_spctr_k1(seq_tsr_spctr_k1_init,"seq_tsr_spctr_k1"); - LinModel<3> seq_tsr_spctr_k2(seq_tsr_spctr_k2_init,"seq_tsr_spctr_k2"); - LinModel<3> seq_tsr_spctr_k3(seq_tsr_spctr_k3_init,"seq_tsr_spctr_k3"); - LinModel<3> seq_tsr_spctr_k4(seq_tsr_spctr_k4_init,"seq_tsr_spctr_k4"); - LinModel<3> seq_tsr_spctr_k5(seq_tsr_spctr_k5_init,"seq_tsr_spctr_k5"); + LinModel<2> seq_tsr_spctr_cst_off_k0(seq_tsr_spctr_cst_off_k0_init,"seq_tsr_spctr_cst_off_k0"); + LinModel<2> seq_tsr_spctr_cst_off_k1(seq_tsr_spctr_cst_off_k1_init,"seq_tsr_spctr_cst_off_k1"); + LinModel<2> seq_tsr_spctr_cst_off_k2(seq_tsr_spctr_cst_off_k2_init,"seq_tsr_spctr_cst_off_k2"); + LinModel<2> seq_tsr_spctr_off_k0(seq_tsr_spctr_off_k0_init,"seq_tsr_spctr_off_k0"); + LinModel<2> seq_tsr_spctr_off_k1(seq_tsr_spctr_off_k1_init,"seq_tsr_spctr_off_k1"); + LinModel<2> seq_tsr_spctr_off_k2(seq_tsr_spctr_off_k2_init,"seq_tsr_spctr_off_k2"); + LinModel<2> seq_tsr_spctr_cst_k0(seq_tsr_spctr_cst_k0_init,"seq_tsr_spctr_cst_k0"); + LinModel<2> seq_tsr_spctr_cst_k1(seq_tsr_spctr_cst_k1_init,"seq_tsr_spctr_cst_k1"); + LinModel<2> seq_tsr_spctr_cst_k2(seq_tsr_spctr_cst_k2_init,"seq_tsr_spctr_cst_k2"); + LinModel<2> seq_tsr_spctr_cst_k3(seq_tsr_spctr_cst_k3_init,"seq_tsr_spctr_cst_k3"); + LinModel<2> 
seq_tsr_spctr_cst_k4(seq_tsr_spctr_cst_k4_init,"seq_tsr_spctr_cst_k4"); + LinModel<2> seq_tsr_spctr_cst_k5(seq_tsr_spctr_cst_k5_init,"seq_tsr_spctr_cst_k5"); + LinModel<2> seq_tsr_spctr_k0(seq_tsr_spctr_k0_init,"seq_tsr_spctr_k0"); + LinModel<2> seq_tsr_spctr_k1(seq_tsr_spctr_k1_init,"seq_tsr_spctr_k1"); + LinModel<2> seq_tsr_spctr_k2(seq_tsr_spctr_k2_init,"seq_tsr_spctr_k2"); + LinModel<2> seq_tsr_spctr_k3(seq_tsr_spctr_k3_init,"seq_tsr_spctr_k3"); + LinModel<2> seq_tsr_spctr_k4(seq_tsr_spctr_k4_init,"seq_tsr_spctr_k4"); + LinModel<2> seq_tsr_spctr_k5(seq_tsr_spctr_k5_init,"seq_tsr_spctr_k5"); double seq_tsr_spctr::est_time_fp(int nlyr, int nblk_A, int nblk_B, int nblk_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){ // return COST_MEMBW*(size_A+size_B+size_C)+COST_FLOP*flops; - double ps[] = {1.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)}; + double ps[] = {(double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)}; switch (krnl_type){ case 0: if (is_custom){ @@ -995,22 +995,22 @@ namespace CTF_int { } } - LinModel<2> pin_keys_mdl(pin_keys_mdl_init,"pin_keys_mdl"); + LinModel<1> pin_keys_mdl(pin_keys_mdl_init,"pin_keys_mdl"); double spctr_pin_keys::est_time_fp(int nlyr, int nblk_A, int nblk_B, int nblk_C, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) { switch (AxBxC){ case 0: { - double ps[] = {1.0, dns_blk_sz*nnz_frac_A}; + double ps[] = {dns_blk_sz*nnz_frac_A}; return pin_keys_mdl.est_time(ps); } case 1: { - double ps[] = {1.0, dns_blk_sz*nnz_frac_B}; + double ps[] = {dns_blk_sz*nnz_frac_B}; return pin_keys_mdl.est_time(ps); } case 2: { - double ps[] = {1.0, dns_blk_sz*nnz_frac_C}; + double ps[] = {dns_blk_sz*nnz_frac_C}; return 2.*pin_keys_mdl.est_time(ps); } } diff --git a/src/interface/common.cxx b/src/interface/common.cxx index df88ba85..a64e1e7e 100644 --- a/src/interface/common.cxx +++ b/src/interface/common.cxx @@ -47,19 +47,19 @@ 
namespace CTF_int { //static double init_mdl[] = {COST_LATENCY, COST_LATENCY, COST_NETWBW}; - LinModel<3> alltoall_mdl(alltoall_mdl_init,"alltoall_mdl"); - LinModel<3> alltoallv_mdl(alltoallv_mdl_init,"alltoallv_mdl"); + LinModel<2> alltoall_mdl(alltoall_mdl_init,"alltoall_mdl"); + LinModel<2> alltoallv_mdl(alltoallv_mdl_init,"alltoallv_mdl"); #ifdef BGQ //static double init_lg_mdl[] = {COST_LATENCY, COST_LATENCY, 0.0, COST_NETWBW + 2.0*COST_MEMBW}; #else //static double init_lg_mdl[] = {COST_LATENCY, COST_LATENCY, COST_NETWBW + 2.0*COST_MEMBW, 0.0}; #endif - LinModel<3> red_mdl(red_mdl_init,"red_mdl"); - LinModel<3> red_mdl_cst(red_mdl_cst_init,"red_mdl_cst"); - LinModel<3> allred_mdl(allred_mdl_init,"allred_mdl"); - LinModel<3> allred_mdl_cst(allred_mdl_cst_init,"allred_mdl_cst"); - LinModel<3> bcast_mdl(bcast_mdl_init,"bcast_mdl"); + LinModel<2> red_mdl(red_mdl_init,"red_mdl"); + LinModel<2> red_mdl_cst(red_mdl_cst_init,"red_mdl_cst"); + LinModel<2> allred_mdl(allred_mdl_init,"allred_mdl"); + LinModel<2> allred_mdl_cst(allred_mdl_cst_init,"allred_mdl_cst"); + LinModel<2> bcast_mdl(bcast_mdl_init,"bcast_mdl"); template @@ -369,12 +369,12 @@ namespace CTF_int { double CommData::estimate_alltoall_time(int64_t chunk_sz) { - double ps[] = {1.0, log2((double)np), log2((double)np)*np*chunk_sz}; + double ps[] = {log2((double)np), log2((double)np)*np*chunk_sz}; return alltoall_mdl.est_time(ps); } double CommData::estimate_alltoallv_time(int64_t tot_sz) { - double ps[] = {1.0, log2((double)np), log2((double)np)*tot_sz}; + double ps[] = {log2((double)np), log2((double)np)*tot_sz}; return alltoallv_mdl.est_time(ps); } @@ -385,7 +385,7 @@ namespace CTF_int { int tsize_; MPI_Type_size(mdtype, &tsize_); - double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_}; + double tps_[] = {0.0, log2(np), ((double)count)*tsize_}; if (!bcast_mdl.should_observe(tps_)) return; #endif @@ -398,7 +398,7 @@ namespace CTF_int { double exe_time = MPI_Wtime()-st_time; int tsize; 
MPI_Type_size(mdtype, &tsize); - double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize}; + double tps[] = {exe_time, log2(np), ((double)count)*tsize}; bcast_mdl.observe(tps); #endif } @@ -411,7 +411,7 @@ namespace CTF_int { #ifdef TUNE int tsize_; MPI_Type_size(mdtype, &tsize_); - double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_*std::max(.5,(double)log2(np))}; + double tps_[] = {0.0, log2(np), ((double)count)*tsize_*std::max(.5,(double)log2(np))}; bool bsr = true; if (op >= MPI_MAX && op <= MPI_REPLACE) bsr = allred_mdl.should_observe(tps_); @@ -428,7 +428,7 @@ namespace CTF_int { double exe_time = MPI_Wtime()-st_time; int tsize; MPI_Type_size(mdtype, &tsize); - double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize*std::max(.5,(double)log2(np))}; + double tps[] = {exe_time, log2(np), ((double)count)*tsize*std::max(.5,(double)log2(np))}; if (op >= MPI_MAX && op <= MPI_REPLACE) allred_mdl.observe(tps); else @@ -442,7 +442,7 @@ namespace CTF_int { // change-of-observe int tsize_; MPI_Type_size(mdtype, &tsize_); - double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_*std::max(.5,(double)log2(np))}; + double tps_[] = {0.0, log2(np), ((double)count)*tsize_*std::max(.5,(double)log2(np))}; bool bsr = true; if (op >= MPI_MAX && op <= MPI_REPLACE) bsr = red_mdl.should_observe(tps_); @@ -459,7 +459,7 @@ namespace CTF_int { double exe_time = MPI_Wtime()-st_time; int tsize; MPI_Type_size(mdtype, &tsize); - double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize*std::max(.5,(double)log2(np))}; + double tps[] = {exe_time, log2(np), ((double)count)*tsize*std::max(.5,(double)log2(np))}; if (op >= MPI_MAX && op <= MPI_REPLACE) red_mdl.observe(tps); else @@ -479,7 +479,7 @@ namespace CTF_int { MPI_Barrier(cm); // change-of-observe int64_t tot_sz_ = std::max(send_displs[np-1]+send_counts[np-1], recv_displs[np-1]+recv_counts[np-1])*datum_size; - double tps_[] = {0.0, 1.0, log2(np), (double)tot_sz_}; + double tps_[] = {0.0, log2(np), 
(double)tot_sz_}; if (!alltoallv_mdl.should_observe(tps_)) return; #endif @@ -572,7 +572,7 @@ namespace CTF_int { #endif double exe_time = MPI_Wtime()-st_time; int64_t tot_sz = std::max(send_displs[np-1]+send_counts[np-1], recv_displs[np-1]+recv_counts[np-1])*datum_size; - double tps[] = {exe_time, 1.0, log2(np), (double)tot_sz}; + double tps[] = {exe_time, log2(np), (double)tot_sz}; alltoallv_mdl.observe(tps); } diff --git a/src/redistribution/dgtog_redist.cxx b/src/redistribution/dgtog_redist.cxx index fffee025..73ab3e01 100644 --- a/src/redistribution/dgtog_redist.cxx +++ b/src/redistribution/dgtog_redist.cxx @@ -6,10 +6,10 @@ #include "dgtog_bucket.h" namespace CTF_int { //static double init_mdl[] = {COST_LATENCY, COST_LATENCY, COST_NETWBW}; - LinModel<3> dgtog_res_mdl(dgtog_res_mdl_init,"dgtog_res_mdl"); + LinModel<2> dgtog_res_mdl(dgtog_res_mdl_init,"dgtog_res_mdl"); double dgtog_est_time(int64_t tot_sz, int np){ - double ps[] = {1.0, (double)log2(np), (double)tot_sz*log2(np)}; + double ps[] = {(double)log2(np), (double)tot_sz*log2(np)}; return dgtog_res_mdl.est_time(ps); } } diff --git a/src/redistribution/nosym_transp.cxx b/src/redistribution/nosym_transp.cxx index e312a0ca..0faf7419 100644 --- a/src/redistribution/nosym_transp.cxx +++ b/src/redistribution/nosym_transp.cxx @@ -12,9 +12,9 @@ namespace CTF_int { //static double init_ct_ps[] = {COST_LATENCY, 1.5*COST_MEMBW}; - LinModel<2> long_contig_transp_mdl(long_contig_transp_mdl_init,"long_contig_transp_mdl"); - LinModel<2> shrt_contig_transp_mdl(shrt_contig_transp_mdl_init,"shrt_contig_transp_mdl"); - LinModel<2> non_contig_transp_mdl(non_contig_transp_mdl_init,"non_contig_transp_mdl"); + LinModel<1> long_contig_transp_mdl(long_contig_transp_mdl_init,"long_contig_transp_mdl"); + LinModel<1> shrt_contig_transp_mdl(shrt_contig_transp_mdl_init,"shrt_contig_transp_mdl"); + LinModel<1> non_contig_transp_mdl(non_contig_transp_mdl_init,"non_contig_transp_mdl"); //#define OPT_NOSYM_TR @@ -445,7 +445,7 @@ 
namespace CTF_int { } tot_sz *= nvirt_A; - double tps[] = {0.0, 1.0, (double)tot_sz}; + double tps[] = {0.0, (double)tot_sz}; bool should_run = true; if (contig0 < 4){ should_run = non_contig_transp_mdl.should_observe(tps); @@ -507,7 +507,7 @@ namespace CTF_int { tot_sz *= nvirt_A; double exe_time = MPI_Wtime() - st_time; - double tps[] = {exe_time, 1.0, (double)tot_sz}; + double tps[] = {exe_time, (double)tot_sz}; if (contig0 < 4){ non_contig_transp_mdl.observe(tps); } else if (contig0 <= 64){ diff --git a/src/shared/model.cxx b/src/shared/model.cxx index ef17ebf4..0bae0939 100644 --- a/src/shared/model.cxx +++ b/src/shared/model.cxx @@ -105,7 +105,7 @@ namespace CTF_int { template struct time_param { - double p[nparam+1]; + double p[nparam+1]; // '+1' is to include observed execution time }; template @@ -121,7 +121,7 @@ namespace CTF_int { //initialize the model as active by default is_active = true; //copy initial static coefficients to initialzie model (defined in init_model.cxx) - memcpy(coeff_guess, init_guess, nparam*sizeof(double)); + memcpy(coeff_guess, init_guess, this->num_model_coefficients*sizeof(double)); name = (char*)malloc(strlen(name_)+1); name[0] = '\0'; strcpy(name, name_); @@ -130,7 +130,7 @@ namespace CTF_int { regularization[i] = coeff_guess[i]*REG_LAMBDA; }*/ hist_size = hist_size_; - mat_lda = nparam+1; + mat_lda = nparam+2;// includes execution time and coefficient (1.0) of bias term. time_param_mat = (double*)malloc(mat_lda*hist_size*sizeof(double)); nobs = 0; is_tuned = false; @@ -192,7 +192,10 @@ namespace CTF_int { assert(tp[0] >= 0.0); // Add the new instance of run process into time_param_mat - memcpy(time_param_mat+(nobs%hist_size)*mat_lda, tp, mat_lda*sizeof(double)); + // This includes observed execution time, 1. for a bias term, and the observed input parameters. 
+ time_param_mat[(nobs%hist_size)*mat_lda] = tp[0]; + time_param_mat[(nobs%hist_size)*mat_lda + 1] = 1.; + memcpy(time_param_mat+(nobs%hist_size)*mat_lda + 2, tp+1, nparam*sizeof(double)); /* tp = {exe_time, x_1, ..., x_nparam}: the observed inputs begin at tp[1] */ /* if (nobs < hist_size){ memcpy(time_param_mat+nobs*mat_lda, tp, mat_lda*sizeof(double)); } else { @@ -224,7 +227,7 @@ namespace CTF_int { template void LinModel::update(MPI_Comm cm){ #ifdef TUNE - double S[nparam]; + double S[this->num_model_coefficients]; int lwork, liwork; double * work; int * iwork; @@ -247,8 +250,8 @@ namespace CTF_int { //define the number of cols in the matrix to be the min of the number of observations and //the number we are willing to store (hist_size) int nrcol = std::min(nobs,(int64_t)hist_size); - //max of the number of local observations and nparam (will usually be the former) - int ncol = std::max(nrcol, nparam); + //max of the number of local observations and this->num_model_coefficients (will usually be the former) + int ncol = std::max(nrcol, this->num_model_coefficients); /* time_param * sort_mat = (time_param*)malloc(sizeof(time_param)*ncol); memcpy(sort_mat, time_param_mat, sizeof(time_param)*ncol); std::sort(sort_mat, sort_mat+ncol, &comp_time_param);*/ @@ -257,52 +260,52 @@ namespace CTF_int { //compute the total number of observations over all processors MPI_Allreduce(&nrcol, &tot_nrcol, 1, MPI_INT, MPI_SUM, cm); - //if there has been more than 16*nparam observations per processor, tune the model - if (tot_nrcol >= 16.*np*nparam){ + //if there has been more than 16*this->num_model_coefficients observations per processor, tune the model + if (tot_nrcol >= 16.*np*this->num_model_coefficients){ is_tuned = true; - //add nparam to ncol to include regularization, don't do so if the number of local + //add this->num_model_coefficients to ncol to include regularization, don't do so if the number of local //observatins is less than the number of params, as in this case, the processor will //not do any local tuning - if (nrcol >= nparam) ncol +=
nparam; + if (nrcol >= this->num_model_coefficients) ncol += this->num_model_coefficients; - double * R = (double*)malloc(sizeof(double)*nparam*nparam); + double * R = (double*)malloc(sizeof(double)*this->num_model_coefficients*this->num_model_coefficients); double * b = (double*)malloc(sizeof(double)*ncol); - //if number of local observations less than than nparam don't do local QR - if (nrcol < nparam){ - std::fill(R, R+nparam*nparam, 0.0); + //if number of local observations less than than this->num_model_coefficients don't do local QR + if (nrcol < this->num_model_coefficients){ + std::fill(R, R+this->num_model_coefficients*this->num_model_coefficients, 0.0); std::fill(b, b+ncol, 0.0); //regularization done on every processor /* if (rk == 0){ - lda_cpy(sizeof(double), 1, nparam, 1, nparam, (char const*)regularization, (char*)R); + lda_cpy(sizeof(double), 1, this->num_model_coefficients, 1, this->num_model_coefficients, (char const*)regularization, (char*)R); }*/ } else { //define tall-skinny matrix A that is almost the transpose of time_param, but excludes the first row of time_param (that has execution times that we will put into b - double * A = (double*)malloc(sizeof(double)*nparam*ncol); + double * A = (double*)malloc(sizeof(double)*this->num_model_coefficients*ncol); int i_st = 0; //figure out the maximum execution time any observation recorded // double max_time = 0.0; - // for (int i=0; inum_model_coefficients; i++){ // max_time = std::max(time_param_mat[i*mat_lda],max_time); // } - /*for (int i=0; inum_model_coefficients; i++){ + R[this->num_model_coefficients*i+i] = REG_LAMBDA; }*/ // do regularization if (true){ //rk == 0){ -// lda_cpy(sizeof(double), 1, nparam, 1, ncol, (char const*)regularization, (char*)A); +// lda_cpy(sizeof(double), 1, this->num_model_coefficients, 1, ncol, (char const*)regularization, (char*)A); //regularization done on every processor // parameter observs. coeffs. times (sec) - //matrix Ax~=b has the form, e.g. 
nparam=2 [ REG_LAMBDA 0 ] [ x_1 ] = [ 0 ] + //matrix Ax~=b has the form, e.g. nparam=1 [ REG_LAMBDA 0 ] [ x_1 ] = [ 0 ] // [ 0 REG_LAMBDA ] [ x_2 ] [ 0 ] // [ obs1p1 obs1p2 ] [ obs1t ] // obsxpy is the yth parameter as observed [ obs2p1 obs2p2 ] [ obs2t ] // in observation x [ ... ... ] [ ... ] // obsxt is the exe time of observation x - for (int i=0; inum_model_coefficients; i++){ b[i] = 0.0; - for (int j=0; jnum_model_coefficients; j++){ if (i==j){ if (coeff_guess[i] != 0.0){ A[ncol*j+i] = std::min(REG_LAMBDA,(avg_tot_time/coeff_guess[i])/1000.); @@ -312,7 +315,7 @@ namespace CTF_int { } else A[ncol*j+i] = 0.0; } } - i_st = nparam; + i_st = this->num_model_coefficients; } //find the max execution time over all processors // MPI_Allreduce(MPI_IN_PLACE, &max_time, 1, MPI_DOUBLE, MPI_MAX, cm); @@ -326,7 +329,7 @@ namespace CTF_int { if (0){ //if (time_param_mat[(i-i_st)*mat_lda] > max_time/3.){ b[i] = 0.0; - for (int j=0; jnum_model_coefficients; j++){ A[i+j*ncol] = 0.0; } } else { @@ -336,13 +339,13 @@ namespace CTF_int { //double rt_chnks = std::sqrt(b[i] / chunk); //double sfactor = rt_chnks/b[i]; //b[i] = rt_chnks; - for (int j=0; jnum_model_coefficients; j++){ A[i+j*ncol] = /*sfactor**/time_param_mat[(i-i_st)*mat_lda+j+1]; } } } /*for (int i=0; inum_model_coefficients; j++){ printf("%+1.3e ", A[i+j*ncol]); } printf (" | %+1.3e\n",b[i]); @@ -350,23 +353,23 @@ namespace CTF_int { //sequential code for fitting Ax=b (NOT USED, only works if running with 1 processor) if (false && np == 1){ - cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info); + cdgelsd(ncol, this->num_model_coefficients, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info); assert(info == 0); lwork = (int)dlwork; work = (double*)malloc(sizeof(double)*lwork); iwork = (int*)malloc(sizeof(int)*liwork); std::fill(iwork, iwork+liwork, 0); - cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info); + cdgelsd(ncol, 
this->num_model_coefficients, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info); //cdgeqrf( assert(info == 0); free(work); free(iwork); free(A); - memcpy(coeff_guess, b, nparam*sizeof(double)); + memcpy(coeff_guess, b, this->num_model_coefficients*sizeof(double)); /*print(); double max_resd_sq = 0.0; - for (int i=0; inum_model_coefficients; i++){ + max_resd_sq = std::max(max_resd_sq, b[this->num_model_coefficients+i]); } printf("%s max residual sq is %lf\n",name,max_resd_sq); double max_err = 0.0; @@ -379,27 +382,27 @@ namespace CTF_int { } //otherwise on the ith processor compute Q_iR_i=A_i and y_i=Q_i^Tb_i - double * tau = (double*)malloc(sizeof(double)*nparam); + double * tau = (double*)malloc(sizeof(double)*this->num_model_coefficients); int lwork; int info; double dlwork; - cdgeqrf(ncol, nparam, A, ncol, tau, &dlwork, -1, &info); + cdgeqrf(ncol, this->num_model_coefficients, A, ncol, tau, &dlwork, -1, &info); lwork = (int)dlwork; double * work = (double*)malloc(sizeof(double)*lwork); - cdgeqrf(ncol, nparam, A, ncol, tau, work, lwork, &info); - lda_cpy(sizeof(double), nparam, nparam, ncol, nparam, (const char *)A, (char*)R); - for (int i=0; inum_model_coefficients, A, ncol, tau, work, lwork, &info); + lda_cpy(sizeof(double), this->num_model_coefficients, this->num_model_coefficients, ncol, this->num_model_coefficients, (const char *)A, (char*)R); + for (int i=0; inum_model_coefficients; i++){ + for (int j=i+1; jnum_model_coefficients; j++){ + R[i*this->num_model_coefficients+j] = 0.0; } } //query how much space dormqr which computes Q_i^Tb_i needs - cdormqr('L', 'T', ncol, 1, nparam, A, ncol, tau, b, ncol, &dlwork, -1, &info); + cdormqr('L', 'T', ncol, 1, this->num_model_coefficients, A, ncol, tau, b, ncol, &dlwork, -1, &info); lwork = (int)dlwork; free(work); work = (double*)malloc(sizeof(double)*lwork); //actually run dormqr which computes Q_i^Tb_i needs - cdormqr('L', 'T', ncol, 1, nparam, A, ncol, tau, b, ncol, work, lwork, &info); + 
cdormqr('L', 'T', ncol, 1, this->num_model_coefficients, A, ncol, tau, b, ncol, work, lwork, &info); free(work); free(tau); free(A); @@ -411,26 +414,26 @@ namespace CTF_int { //FIXME: can be smarter but not clear if necessary if (rk < sub_np){ //all_R will have the Rs from each processor vertically stacked as [R_1^T .. R_32^T]^T - double * all_R = (double*)malloc(sizeof(double)*nparam*nparam*sub_np); + double * all_R = (double*)malloc(sizeof(double)*this->num_model_coefficients*this->num_model_coefficients*sub_np); //all_b will have the bs from each processor vertically stacked as [b_1^T .. b_32^T]^T - double * all_b = (double*)malloc(sizeof(double)*nparam*sub_np); + double * all_b = (double*)malloc(sizeof(double)*this->num_model_coefficients*sub_np); //gather all Rs from all the processors - MPI_Allgather(R, nparam*nparam, MPI_DOUBLE, all_R, nparam*nparam, MPI_DOUBLE, sub_comm); - double * Rs = (double*)malloc(sizeof(double)*nparam*nparam*sub_np); + MPI_Allgather(R, this->num_model_coefficients*this->num_model_coefficients, MPI_DOUBLE, all_R, this->num_model_coefficients*this->num_model_coefficients, MPI_DOUBLE, sub_comm); + double * Rs = (double*)malloc(sizeof(double)*this->num_model_coefficients*this->num_model_coefficients*sub_np); for (int i=0; inum_model_coefficients, this->num_model_coefficients, this->num_model_coefficients, sub_np*this->num_model_coefficients, (const char *)(all_R+i*this->num_model_coefficients*this->num_model_coefficients), (char*)(Rs+i*this->num_model_coefficients)); } //gather all bs from all the processors - MPI_Allgather(b, nparam, MPI_DOUBLE, all_b, nparam, MPI_DOUBLE, sub_comm); + MPI_Allgather(b, this->num_model_coefficients, MPI_DOUBLE, all_b, this->num_model_coefficients, MPI_DOUBLE, sub_comm); free(b); free(all_R); free(R); - ncol = sub_np*nparam; + ncol = sub_np*this->num_model_coefficients; b = all_b; double * A = Rs; /* if (rk==0){ for (int r=0; rnum_model_coefficients; c++){ printf("A[%d, %d] = %lf, ", r,c,A[c*ncol+r]); } 
printf("b[%d] = %lf\n",r,b[r]); @@ -438,42 +441,42 @@ namespace CTF_int { }*/ //compute fit for a reduced system // parameter observs. coeffs. times (sec) - //matrix Ax~=b has the form, e.g. nparam=2 [ R_1 ] [ x_1 ] = [ y_1 ] + //matrix Ax~=b has the form, e.g. nparam=1 [ R_1 ] [ x_1 ] = [ y_1 ] // [ R_2 ] [ x_2 ] [ y_2 ] // [ ... ] [ ... ] // [ R_32 ] [ y_32 ] //note 32 is p if p < 32 - cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info); + cdgelsd(ncol, this->num_model_coefficients, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info); assert(info == 0); lwork = (int)dlwork; work = (double*)malloc(sizeof(double)*lwork); iwork = (int*)malloc(sizeof(int)*liwork); std::fill(iwork, iwork+liwork, 0); - cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info); + cdgelsd(ncol, this->num_model_coefficients, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info); //cdgeqrf( assert(info == 0); free(work); free(iwork); free(A); //double step = 1.; - //for (int ii=0; iinum_model_coefficients; ii++){ // if (b[ii] <= 0.){ // step = std::min(step, -.999*coeff_guess[ii]/(b[ii]-coeff_guess[ii])); // } //} //assert(step>=0.); //if (step == 1.) 
- // memcpy(coeff_guess, b, nparam*sizeof(double)); + // memcpy(coeff_guess, b, this->num_model_coefficients*sizeof(double)); //else { - // for (int ii=0; iinum_model_coefficients; ii++){ // coeff_guess[ii] = (1.-step)*coeff_guess[ii] + step*b[ii]; // } //} - memcpy(coeff_guess, b, nparam*sizeof(double)); + memcpy(coeff_guess, b, this->num_model_coefficients*sizeof(double)); /*print(); double max_resd_sq = 0.0; - for (int i=0; inum_model_coefficients; i++){ + max_resd_sq = std::max(max_resd_sq, b[this->num_model_coefficients+i]); } printf("%s max residual sq is %lf\n",name,max_resd_sq); double max_err = 0.0; @@ -485,8 +488,8 @@ namespace CTF_int { } MPI_Comm_free(&sub_comm); //broadcast new coefficient guess - MPI_Bcast(coeff_guess, nparam, MPI_DOUBLE, 0, cm); - /*for (int i=0; inum_model_coefficients, MPI_DOUBLE, 0, cm); + /*for (int i=0; inum_model_coefficients; i++){ regularization[i] = coeff_guess[i]*REG_LAMBDA; }*/ } @@ -540,9 +543,9 @@ namespace CTF_int { template double LinModel::est_time(double const * param){ - double d = 0.; + double d = coeff_guess[0]; for (int i=0; i::print(){ assert(name!=NULL); printf("double %s_init[] = {",name); - for (int i=0; inum_model_coefficients; i++){ if (i>0) printf(", "); printf("%1.4E", coeff_guess[i]); } @@ -583,12 +586,12 @@ namespace CTF_int { // Generate the new line in the file std::string new_coeff_str = model_name+" "; char buffer[64]; - for(int i =0; inum_model_coefficients; i++){ buffer[0] = '\0'; std::sprintf(buffer,"%1.4E", coeff_guess[i]); std::string s(buffer); new_coeff_str += s; - if (i != nparam - 1){ + if (i != this->num_model_coefficients - 1){ new_coeff_str += " "; } } @@ -662,7 +665,7 @@ namespace CTF_int { // Get the nparam coeffs // double coeff_from_file [nparam]; - for(int i=0; inum_model_coefficients; i++){ if(!std::getline(f,s,' ')){ right_num_coeff = false; break; @@ -690,7 +693,7 @@ namespace CTF_int { else if (!right_num_coeff){ std::cout<<"Error! 
Number of coefficients in file does not match with the model"<num_model_coefficients;i++){ coeff_guess[i] = 0.0; } } @@ -711,7 +714,7 @@ namespace CTF_int { if (my_rank == 0){ // Dump the model coeffs - for(int i=0; inum_model_coefficients; i++){ ofs<num_model_coefficients*(nparam+2)/6+nparam*this->num_model_coefficients/2+nparam]; cube_params(param, lparam, nparam); return lparam; }*/ diff --git a/src/shared/model.h b/src/shared/model.h index 2ff60885..865dec37 100644 --- a/src/shared/model.h +++ b/src/shared/model.h @@ -22,6 +22,10 @@ namespace CTF_int { virtual void load_coeff(std::string file_name){}; virtual void write_coeff(std::string file_name){}; virtual void dump_data(std::string path){}; + + protected: + /** \brief number of (learned) coefficients comprising the global model */ + int num_model_coefficients; }; void update_all_models(MPI_Comm cm); @@ -75,7 +79,7 @@ namespace CTF_int { /** * \brief constructor - * \param[in] init_guess array of size nparam consisting of initial model parameter guesses + * \param[in] init_guess array of size nparam+1 consisting of initial model parameter guesses * \param[in] name identifier * \param[in] hist_size number of times to keep in history */ @@ -153,7 +157,7 @@ namespace CTF_int { public: /** * \brief constructor - * \param[in] init_guess array of size nparam consisting of initial model parameter guesses + * \param[in] init_guess array of size nparam+1 consisting of initial model parameter guesses * \param[in] name identifier * \param[in] hist_size number of times to keep in history */ diff --git a/src/shared/offload.cu b/src/shared/offload.cu index 1b6a97e6..e0ea63b8 100644 --- a/src/shared/offload.cu +++ b/src/shared/offload.cu @@ -67,16 +67,16 @@ namespace CTF_int{ /*offload_tsr::~offload_tsr(){ }*/ - LinModel<2> upload_mdl(upload_mdl_init,"upload_mdl"); - LinModel<2> download_mdl(download_mdl_init,"download_mdl"); + LinModel<1> upload_mdl(upload_mdl_init,"upload_mdl"); + LinModel<1> 
download_mdl(download_mdl_init,"download_mdl"); double estimate_download_time(int64_t size){ - double ps[] = {1.0, (double)size}; + double ps[] = {(double)size}; return download_mdl.est_time(ps); } double estimate_upload_time(int64_t size){ - double ps[] = {1.0, (double)size}; + double ps[] = {(double)size}; return upload_mdl.est_time(ps); } @@ -136,7 +136,7 @@ namespace CTF_int{ cudaError_t err = cudaMemcpy(host_spr, dev_spr, nbytes, cudaMemcpyDeviceToHost); double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)nbytes}; + double tps[] = {exe_time, (double)nbytes}; download_mdl.observe(tps); TAU_FSTOP(cuda_download); assert(err == cudaSuccess); @@ -150,7 +150,7 @@ namespace CTF_int{ cudaMemcpyHostToDevice); double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)nbytes}; + double tps[] = {exe_time, (double)nbytes}; upload_mdl.observe(tps); TAU_FSTOP(cuda_upload); assert(err == cudaSuccess); diff --git a/src/tensor/algstrct.cxx b/src/tensor/algstrct.cxx index 855615e2..0855e060 100644 --- a/src/tensor/algstrct.cxx +++ b/src/tensor/algstrct.cxx @@ -9,9 +9,7 @@ using namespace std; namespace CTF_int { - LinModel<3> csrred_mdl(csrred_mdl_init,"csrred_mdl"); - LinModel<3> csrred_mdl_cst(csrred_mdl_cst_init,"csrred_mdl_cst"); - + LinModel<2> csrred_mdl(csrred_mdl_init,"csrred_mdl"); template struct CompPair{ @@ -519,7 +517,7 @@ namespace CTF_int { cdealloc(red_sum); cdealloc(cb_bufs); double t_end = MPI_Wtime() - t_st; - double tps[] = {t_end, 1.0, log2((double)p), (double)sz_A}; + double tps[] = {t_end, log2((double)p), (double)sz_A}; // note-quite-sure csrred_mdl.observe(tps); @@ -543,7 +541,7 @@ namespace CTF_int { double algstrct::estimate_csr_red_time(int64_t msg_sz, CommData const * cdt) const { - double ps[] = {1.0, log2((double)cdt->np), (double)msg_sz}; + double ps[] = {log2((double)cdt->np), (double)msg_sz}; return csrred_mdl.est_time(ps); } diff --git a/src/tensor/untyped_tensor.cxx 
b/src/tensor/untyped_tensor.cxx index 134b01fe..fb578b98 100644 --- a/src/tensor/untyped_tensor.cxx +++ b/src/tensor/untyped_tensor.cxx @@ -21,7 +21,7 @@ using namespace CTF; namespace CTF_int { - LinModel<3> spredist_mdl(spredist_mdl_init,"spredist_mdl"); + LinModel<2> spredist_mdl(spredist_mdl_init,"spredist_mdl"); double spredist_est_time(int64_t size, int np){ - double ps[] = {1.0, (double)log2(np), (double)size*log2(np)}; + double ps[] = {(double)log2(np), (double)size*log2(np)}; return spredist_mdl.est_time(ps); @@ -2986,7 +2986,7 @@ namespace CTF_int { #ifdef TUNE // change-of-observe double nnz_frac_ = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np); - double tps_[] = {0.0, 1.0, (double)log2(wrld->cdt.np), (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac_}; + double tps_[] = {0.0, (double)log2(wrld->cdt.np), (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac_}; if (!spredist_mdl.should_observe(tps_)) return SUCCESS; double st_time = MPI_Wtime(); @@ -3008,7 +3008,7 @@ namespace CTF_int { #ifdef TUNE double exe_time = MPI_Wtime()-st_time; double nnz_frac = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np); - double tps[] = {exe_time, 1.0, (double)log2(wrld->cdt.np), (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac}; + double tps[] = {exe_time, (double)log2(wrld->cdt.np), (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac}; spredist_mdl.observe(tps); #endif } else { From 6d60db96f484037e3a2e451b413d9e79caa06911 Mon Sep 17 00:00:00 2001 From: Edward Hutter Date: Tue, 19 Sep 2023 10:31:11 -0500 Subject: [PATCH 2/2] Removed any lingering use of 1 as feature to model --- src/contraction/spctr_tsr.cxx | 8 ++++---- src/interface/common.cxx | 6 +++--- src/redistribution/dgtog_redist_ror.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/contraction/spctr_tsr.cxx b/src/contraction/spctr_tsr.cxx index 5ccde546..fa9015ed 100644 --- a/src/contraction/spctr_tsr.cxx 
+++ b/src/contraction/spctr_tsr.cxx @@ -378,7 +378,7 @@ namespace CTF_int { } - double tps_[] = {0.0, 1.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_B, nnz_frac_B, nnz_frac_C)}; + double tps_[] = {0.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_B, nnz_frac_B, nnz_frac_C)}; // Check if we need to execute this function for the sake of training bool bsr = true; switch (krnl_type){ @@ -550,7 +550,7 @@ namespace CTF_int { } double exe_time = MPI_Wtime() - st_time; - double tps[] = {exe_time, 1.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_B, nnz_frac_B, nnz_frac_C)}; + double tps[] = {exe_time, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_B, nnz_frac_B, nnz_frac_C)}; switch (krnl_type){ case 0: if (is_custom){ @@ -1084,7 +1084,7 @@ namespace CTF_int { pi.pin(nnz, order, lens, divisor, pi_new); double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)nnz}; + double tps[] = {exe_time, (double)nnz}; pin_keys_mdl.observe(tps); TAU_FSTOP(spctr_pin_keys); @@ -1109,7 +1109,7 @@ namespace CTF_int { } depin(sr_C, order, lens, divisor, nblk_C, virt_dim, phys_rank, new_C, new_nnz_C, size_blk_C, new_C, true); double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)nnz}; + double tps[] = {exe_time, (double)nnz}; pin_keys_mdl.observe(tps); break; } diff --git a/src/interface/common.cxx b/src/interface/common.cxx index a64e1e7e..17edf7cf 100644 --- a/src/interface/common.cxx +++ b/src/interface/common.cxx @@ -339,12 +339,12 @@ namespace CTF_int { } double CommData::estimate_bcast_time(int64_t msg_sz){ - double ps[] = {1.0, log2((double)np), (double)msg_sz}; + double ps[] = {log2((double)np), (double)msg_sz}; return bcast_mdl.est_time(ps); } double CommData::estimate_allred_time(int64_t msg_sz, MPI_Op op){ - double ps[] = {1.0, log2((double)np), (double)msg_sz*log2((double)(np))}; + double ps[] = {log2((double)np), 
(double)msg_sz*log2((double)(np))}; if (op >= MPI_MAX && op <= MPI_REPLACE) return allred_mdl.est_time(ps); else @@ -352,7 +352,7 @@ namespace CTF_int { } double CommData::estimate_red_time(int64_t msg_sz, MPI_Op op){ - double ps[] = {1.0, log2((double)np), (double)msg_sz*log2((double)(np))}; + double ps[] = {log2((double)np), (double)msg_sz*log2((double)(np))}; if (op >= MPI_MAX && op <= MPI_REPLACE) return red_mdl.est_time(ps); else diff --git a/src/redistribution/dgtog_redist_ror.h b/src/redistribution/dgtog_redist_ror.h index 031b6885..d3d4a298 100644 --- a/src/redistribution/dgtog_redist_ror.h +++ b/src/redistribution/dgtog_redist_ror.h @@ -712,7 +712,7 @@ void dgtog_reshuffle(int const * sym, MPI_Barrier(ord_glb_comm.cm); #endif double exe_time = MPI_Wtime()-st_time; - double tps[] = {exe_time, 1.0, (double)log2(ord_glb_comm.np), (double)std::max(old_dist.size, new_dist.size)*log2(ord_glb_comm.np)*sr->el_size}; + double tps[] = {exe_time, (double)log2(ord_glb_comm.np), (double)std::max(old_dist.size, new_dist.size)*log2(ord_glb_comm.np)*sr->el_size}; // double-check dgtog_res_mdl.observe(tps);