From fabdd13cefef65d6761ff3e9b8bbf93e42b14a15 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Mon, 12 Aug 2024 18:03:56 -0700
Subject: [PATCH] client: address issue with max concurrent and work fetch

Max concurrent is a limit on jobs, not processor instances.
The work fetch logic made the erroneous implicit assumption
that all jobs use 1 CPU.
So e.g. if a project has max concurrent 4, and the client is running
two of its 2-CPU jobs, it will think (if the work buffer is zero)
that there's no point in fetching more work.
But in fact the project could use 8 CPUs, so 4 are idle.

Fix: if a project has MC constraints, then for each resource compute
'mc_max_could_use': the max # of instances the project could use,
given its MC constraints.
Use this to compute the project's shortfall, and hence to decide
whether to fetch work from it.

Note: the way mc_max_could_use is computed is crude;
it takes the max over all apps, when it's possible that only
one of them has an MC constraint.
This could result in limited over-fetching, but that's preferable
to under-fetching and starvation.

Sim: show the app name in the timeline.
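To make the arithmetic concrete, here is a minimal standalone sketch of the
new per-resource bound, using the numbers from the example above (max
concurrent 4, 2-CPU jobs, an 8-CPU host). Only project_min_mc and
mc_max_could_use correspond to names touched by this patch; the other
variables are illustrative:

    #include <algorithm>
    #include <cstdio>

    int main() {
        int project_min_mc = 4;     // smallest max-concurrent limit for the project
        double max_usage = 2.0;     // largest per-job CPU usage among its app versions
        double ninstances = 8.0;    // CPUs on the host

        // Old implicit assumption: each job uses one instance, so the
        // project can never use more than project_min_mc instances.
        double old_bound = project_min_mc;                                            // 4

        // New bound: the MC limit counts jobs, so scale by per-job usage
        // and cap at the number of instances that actually exist.
        double mc_max_could_use = std::min(project_min_mc * max_usage, ninstances);   // 8

        // With two 2-CPU jobs running, 4 instances are busy: the old bound
        // reports no shortfall, the new bound reports a shortfall of 4.
        printf("old bound %.0f, new bound %.0f\n", old_bound, mc_max_could_use);
        return 0;
    }

The cap at ninstances mirrors the std::min() call added to rr_init() below:
a project can never use more instances than the host has.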
---
 client/client_state.cpp |  2 +-
 client/makefile_sim     |  3 +++
 client/project.h        |  4 ++--
 client/rr_sim.cpp       | 14 +++++---------
 client/sim.cpp          | 11 ++++++-----
 client/work_fetch.cpp   | 35 ++++++++++++++++++++++++++++-------
 client/work_fetch.h     | 15 ++++++++++-----
 lib/cc_config.h         |  5 +++--
 8 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/client/client_state.cpp b/client/client_state.cpp
index d371aa7741b..0cf6dc7e1d5 100644
--- a/client/client_state.cpp
+++ b/client/client_state.cpp
@@ -2118,7 +2118,7 @@ int CLIENT_STATE::reset_project(PROJECT* project, bool detaching) {
     project->min_rpc_time = 0;
     project->pwf.reset(project);
     for (int j=0; j<coprocs.n_rsc; j++) {
-        project->rsc_pwf[j].reset();
+        project->rsc_pwf[j].reset(j);
     }
     write_state_file();
     return 0;
diff --git a/client/makefile_sim b/client/makefile_sim
index b7ad1fa60aa..7716917d749 100644
--- a/client/makefile_sim
+++ b/client/makefile_sim
@@ -1,5 +1,8 @@
 # makefile for client simulator
 # Do "make_clean" in client/, lib/, and sched/ first
+#
+# this doesn't have .h dependencies; if you change something,
+# do make clean and make
 
 CXXFLAGS = -g -DSIM -Wall \
     -I ../lib \
diff --git a/client/project.h b/client/project.h
index 85ff4ba1ff7..569821b7d46 100644
--- a/client/project.h
+++ b/client/project.h
@@ -276,9 +276,9 @@ struct PROJECT : PROJ_AM {
         //
     RSC_PROJECT_WORK_FETCH rsc_pwf[MAX_RSC];
     PROJECT_WORK_FETCH pwf;
-    inline void reset() {
+    inline void work_fetch_reset() {
         for (int i=0; i<MAX_RSC; i++) {
-            rsc_pwf[i].reset();
+            rsc_pwf[i].reset(i);
         }
     }
 
diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp
--- a/client/rr_sim.cpp
+++ b/client/rr_sim.cpp
@@ ... @@
-        if (p->rsc_pwf[0].sim_nused > p->rsc_pwf[0].max_nused) {
-            p->rsc_pwf[0].max_nused = p->rsc_pwf[0].sim_nused;
-        }
-        if (rt && p->rsc_pwf[rt].sim_nused > p->rsc_pwf[rt].max_nused) {
-            p->rsc_pwf[rt].max_nused = p->rsc_pwf[rt].sim_nused;
-        }
     }
 }
@@ -438,9 +432,11 @@ static void mc_update_stats(double sim_now, double dt, double buf_end) {
         if (!p->app_configs.project_has_mc) continue;
         for (int rt=0; rt<coprocs.n_rsc; rt++) {
             RSC_PROJECT_WORK_FETCH& rsc_pwf = p->rsc_pwf[rt];
-            RSC_WORK_FETCH& rwf = rsc_work_fetch[rt];
-            double x = rsc_pwf.max_nused - rsc_pwf.sim_nused;
-            x = std::min(x, rwf.ninstances - rwf.sim_nused);
+
+            // x is the number of instances this project isn't using but could
+            // (given MC constraints)
+            //
+            double x = rsc_pwf.mc_max_could_use - rsc_pwf.sim_nused;
             if (x > 1e-6 && sim_now < buf_end) {
                 double dt2;
                 if (sim_now + dt > buf_end) {
diff --git a/client/sim.cpp b/client/sim.cpp
index be51c4db90f..7f8d5a528d6 100644
--- a/client/sim.cpp
+++ b/client/sim.cpp
@@ -847,8 +847,9 @@ void show_resource(int rsc_type) {
     bool found = false;
     for (i=0; i<gstate.active_tasks.active_tasks.size(); i++) {
         ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i];
-        RESULT* rp = atp->result;
         if (atp->task_state() != PROCESS_EXECUTING) continue;
+        RESULT* rp = atp->result;
+        PROJECT* p = rp->project;
         double ninst=0;
         if (rsc_type) {
             if (rp->avp->gpu_usage.rsc_type != rsc_type) continue;
@@ -857,12 +858,11 @@ void show_resource(int rsc_type) {
             ninst = rp->avp->avg_ncpus;
         }
 
-        PROJECT* p = rp->project;
         if (!found) {
             found = true;
             fprintf(html_out,
                 "<table>\n"
-                "<tr><th>#devs</th><th>Job name (* = high priority)</th><th>GFLOPs left</th>%s</tr>\n",
+                "<tr><th>#devs</th><th>App</th><th>Job name (* = high priority)</th><th>GFLOPs left</th>%s</tr>\n",
                 rsc_type?"<th>GPU</th>":""
             );
         }
@@ -871,8 +871,9 @@ void show_resource(int rsc_type) {
         } else {
             safe_strcpy(buf, "");
         }
-        fprintf(html_out, "<tr valign=top><td>%.2f</td><td bgcolor=%s>%s%s</td><td>%.0f</td>%s</tr>\n",
+        fprintf(html_out, "<tr valign=top><td>%.2f</td><td>%s</td><td bgcolor=%s>%s%s</td><td>%.0f</td>%s</tr>\n",
             ninst,
+            rp->wup->app->name,
             colors[p->proj_index%NCOLORS],
             rp->edf_scheduled?"*":"",
             rp->name,
@@ -1340,7 +1341,7 @@ void clear_backoff() {
     for (i=0; i<gstate.projects.size(); i++) {
         PROJECT* p = gstate.projects[i];
         for (int j=0; j<coprocs.n_rsc; j++) {
-            p->rsc_pwf[j].reset();
+            p->rsc_pwf[j].reset(j);
         }
         p->min_rpc_time = 0;
     }
diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp
index ac83fca871b..09824893f42 100644
--- a/client/work_fetch.cpp
+++ b/client/work_fetch.cpp
@@ -68,6 +68,7 @@ inline bool has_coproc_app(PROJECT* p, int rsc_type) {
 /////////////// RSC_PROJECT_WORK_FETCH ///////////////
 
 void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT *p) {
+    unsigned int i;
     fetchable_share = 0;
     n_runnable_jobs = 0;
     sim_nused = 0;
@@ -75,7 +76,29 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT *p) {
     deadlines_missed = 0;
     mc_shortfall = 0;
     last_mc_limit_reltime = 0;
-    max_nused = p->app_configs.project_min_mc;
+    if (p->app_configs.project_has_mc) {
+        // compute x = max usage over this resource over P's app versions
+        double x = 0;
+        for (i=0; i<gstate.app_versions.size(); i++) {
+            APP_VERSION* avp = gstate.app_versions[i];
+            if (avp->project != p) continue;
+            if (rsc_type && (avp->gpu_usage.rsc_type == rsc_type)) {
+                if (avp->gpu_usage.usage > x) x = avp->gpu_usage.usage;
+            } else {
+                if (avp->avg_ncpus > x) x = avp->avg_ncpus;
+            }
+        }
+
+        // max instances this project could use is (approximately)
+        // its smallest max concurrent limit times x
+        // This doesn't take into account e.g. that the MC limit
+        // could be from a different app than the one that determined x
+        //
+        mc_max_could_use = std::min(
+            p->app_configs.project_min_mc*x,
+            (double)(rsc_work_fetch[rsc_type].ninstances)
+        );
+    }
 }
 
 void RSC_PROJECT_WORK_FETCH::resource_backoff(PROJECT* p, const char* name) {
@@ -98,9 +121,7 @@
 // check for backoff must go last, so that if that's the reason
 // we know that there are no other reasons (for piggyback)
 //
-RSC_REASON RSC_PROJECT_WORK_FETCH::compute_rsc_project_reason(
-    PROJECT *p, int rsc_type
-) {
+RSC_REASON RSC_PROJECT_WORK_FETCH::compute_rsc_project_reason(PROJECT *p) {
     RSC_WORK_FETCH& rwf = rsc_work_fetch[rsc_type];
     // see whether work fetch for this resource is banned
     // by prefs, config, project, or acct mgr
@@ -373,7 +394,7 @@ void RSC_WORK_FETCH::clear_request() {
 
 void PROJECT_WORK_FETCH::reset(PROJECT* p) {
     for (int i=0; i<coprocs.n_rsc; i++) {
-        p->rsc_pwf[i].reset();
+        p->rsc_pwf[i].reset(i);
     }
 }
 
@@ -696,7 +717,7 @@ void WORK_FETCH::setup() {
         p->pwf.project_reason = compute_project_reason(p);
         for (int j=0; j<coprocs.n_rsc; j++) {
             RSC_PROJECT_WORK_FETCH& rpwf = p->rsc_pwf[j];
-            rpwf.rsc_project_reason = rpwf.compute_rsc_project_reason(p, j);
+            rpwf.rsc_project_reason = rpwf.compute_rsc_project_reason(p);
         }
     }
     for (int j=0; j<coprocs.n_rsc; j++) {
diff --git a/lib/cc_config.h b/lib/cc_config.h
--- a/lib/cc_config.h
+++ b/lib/cc_config.h
@@ ... @@
     std::vector<APP_VERSION_CONFIG> app_version_configs;
     int project_max_concurrent;
     bool project_has_mc;
-        // have app- or project-level max concurrent restriction
+        // the project has app- or project-level restriction
+        // on # of concurrent jobs
     int project_min_mc;
-        // the min of these restrictions
+        // if true, the min of these restrictions
     bool report_results_immediately;
 
     int parse(XML_PARSER&, MSG_VEC&, LOG_FLAGS&);
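A note on the repeated reset() -> reset(j) changes above: the new rr_init()
code refers to rsc_type and rsc_work_fetch[rsc_type] without taking a
resource argument, so RSC_PROJECT_WORK_FETCH evidently records which
resource it belongs to, and reset() is where that index gets stored. The
actual declarations live in client/work_fetch.h (not shown here); the struct
below is only a speculative sketch of that shape, with everything beyond
rsc_type, mc_max_could_use, mc_shortfall, and sim_nused assumed or omitted:

    // Speculative sketch (not the actual work_fetch.h): just enough of
    // RSC_PROJECT_WORK_FETCH to show why reset() now takes the resource index.
    struct RSC_PROJECT_WORK_FETCH_SKETCH {
        int rsc_type;               // assumed: which resource (0 = CPU) this struct describes
        double mc_max_could_use;    // max instances usable under max-concurrent limits
        double mc_shortfall;        // per-resource shortfall accumulated by the simulator
        double sim_nused;           // instances in use in the simulation

        void reset(int rt) {        // call sites changed from reset() to reset(j)
            rsc_type = rt;          // remember the resource, so rr_init() and
                                    // compute_rsc_project_reason() no longer need it passed in
            mc_max_could_use = 0;
            mc_shortfall = 0;
            sim_nused = 0;
        }
    };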