From 8020fde86a6edde08a60734b2869e6d5584f4cb0 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Mon, 28 Oct 2024 19:34:30 -0700 Subject: [PATCH 1/5] client/API/docker_wrapper: get CPU and mem usage of Docker apps Mem usage (WSS): The easiest way to get the WSS of a Docker container is to ask Docker using the "docker stats" command. So I have docker_wrapper do this periodically (10 sec... it's a bit slow). But how to get this back to the client? Currently there's no provision for an app to report its own WSS. So I added one, by adding an optional field to the app status messages sent from app to client in shared mem. If this is present, the client uses it instead of procinfo. CPU time: "docker stats" reports CPU fraction (averaged over what period?) We multiply that by the stats poll period. Not exactly the same as CPU time, but close enough. --- api/boinc_api.cpp | 9 ++- api/boinc_api.h | 3 +- client/app.cpp | 3 + client/app.h | 14 +++- client/app_control.cpp | 31 ++++---- samples/docker_wrapper/docker_wrapper.cpp | 93 +++++++++++++++++++---- samples/vboxwrapper/vbox_common.cpp | 3 +- samples/vboxwrapper/vboxwrapper.cpp | 6 +- 8 files changed, 124 insertions(+), 38 deletions(-) diff --git a/api/boinc_api.cpp b/api/boinc_api.cpp index be7b345d3e9..6b1709f6d09 100644 --- a/api/boinc_api.cpp +++ b/api/boinc_api.cpp @@ -1052,7 +1052,8 @@ int boinc_report_app_status_aux( double _fraction_done, int other_pid, double _bytes_sent, - double _bytes_received + double _bytes_received, + double wss ) { char msg_buf[MSG_CHANNEL_SIZE], buf[1024]; if (standalone) return 0; @@ -1081,6 +1082,10 @@ int boinc_report_app_status_aux( sprintf(buf, "%d\n", ac_state); strlcat(msg_buf, buf, sizeof(msg_buf)); } + if (wss) { + sprintf(buf, "%f\n", wss); + strlcat(msg_buf, buf, sizeof(msg_buf)); + } #ifdef MSGS_FROM_FILE if (fout) { fputs(msg_buf, fout); @@ -1100,7 +1105,7 @@ int boinc_report_app_status( double _fraction_done ){ return boinc_report_app_status_aux( - cpu_time, checkpoint_cpu_time, 
_fraction_done, 0, 0, 0 + cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0, 0 ); } diff --git a/api/boinc_api.h b/api/boinc_api.h index acb65bb8452..758cf7e55b9 100644 --- a/api/boinc_api.h +++ b/api/boinc_api.h @@ -138,7 +138,8 @@ extern int boinc_upload_status(std::string& name); extern char* boinc_msg_prefix(char*, int); extern int boinc_report_app_status_aux( double cpu_time, double checkpoint_cpu_time, double _fraction_done, - int other_pid, double bytes_sent, double bytes_received + int other_pid, double bytes_sent, double bytes_received, + double wss ); extern int boinc_temporary_exit( int delay, const char* reason=NULL, bool is_notice=false diff --git a/client/app.cpp b/client/app.cpp index eda742eb22b..8fbfba2c773 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -110,6 +110,7 @@ ACTIVE_TASK::ACTIVE_TASK() { peak_disk_usage = 0; once_ran_edf = false; + wss_from_app = 0; fraction_done = 0; fraction_done_elapsed_time = 0; first_fraction_done = 0; @@ -420,6 +421,8 @@ void ACTIVE_TASK_SET::get_memory_usage() { // at least on Windows. Use the VM size instead. // pi.working_set_size_smoothed = atp->wup->rsc_memory_bound; + } else if (atp->wss_from_app > 0) { + pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + atp->wss_from_app); } else { pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + pi.working_set_size); } diff --git a/client/app.h b/client/app.h index d7b306bea2f..72534512ef8 100644 --- a/client/app.h +++ b/client/app.h @@ -53,12 +53,14 @@ typedef int PROCESS_ID; // Represents a job in progress. -// When an active task is created, it is assigned a "slot" +// When a job is started, it is assigned a "slot" // which determines the directory it runs in. -// This doesn't change over the life of the active task; -// thus the task can use the slot directory for temp files +// This doesn't change over the life of the job; +// so it can use the slot directory for temp files // that BOINC doesn't know about. 
+// If you add anything, initialize it in the constructor +// struct ACTIVE_TASK { #ifdef _WIN32 HANDLE process_handle, shm_handle; @@ -100,8 +102,12 @@ struct ACTIVE_TASK { // most recent CPU time reported by app bool once_ran_edf; - // END OF ITEMS SAVED IN STATE FILE + // END OF ITEMS SAVED IN STATE FILES + double wss_from_app; + // work set size reported by the app + // (e.g. docker_wrapper does this). + // If nonzero, use this instead of procinfo data double fraction_done; // App's estimate of how much of the work unit is done. // Passed from the application via an API call; diff --git a/client/app_control.cpp b/client/app_control.cpp index b03a9875db1..14c628461cb 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -1439,8 +1439,23 @@ bool ACTIVE_TASK::get_app_status_msg() { } } } - parse_double(msg_buf, "", current_cpu_time); - parse_double(msg_buf, "", checkpoint_cpu_time); + if (parse_double(msg_buf, "", current_cpu_time)) { + if (current_cpu_time < 0) { + msg_printf(result->project, MSG_INFO, + "app reporting negative CPU: %f", current_cpu_time + ); + current_cpu_time = 0; + } + } + if (parse_double(msg_buf, "", checkpoint_cpu_time)) { + if (checkpoint_cpu_time < 0) { + msg_printf(result->project, MSG_INFO, + "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time + ); + checkpoint_cpu_time = 0; + } + } + parse_double(msg_buf, "", wss_from_app); parse_double(msg_buf, "", result->fpops_per_cpu_sec); parse_double(msg_buf, "", result->fpops_cumulative); parse_double(msg_buf, "", result->intops_per_cpu_sec); @@ -1470,18 +1485,6 @@ bool ACTIVE_TASK::get_app_status_msg() { if (parse_int(msg_buf, "", i)) { sporadic_ac_state = (SPORADIC_AC_STATE)i; } - if (current_cpu_time < 0) { - msg_printf(result->project, MSG_INFO, - "app reporting negative CPU: %f", current_cpu_time - ); - current_cpu_time = 0; - } - if (checkpoint_cpu_time < 0) { - msg_printf(result->project, MSG_INFO, - "app reporting negative checkpoint CPU: %f", 
checkpoint_cpu_time - ); - checkpoint_cpu_time = 0; - } return true; } diff --git a/samples/docker_wrapper/docker_wrapper.cpp b/samples/docker_wrapper/docker_wrapper.cpp index 70d9df0039b..09458bc445c 100644 --- a/samples/docker_wrapper/docker_wrapper.cpp +++ b/samples/docker_wrapper/docker_wrapper.cpp @@ -79,14 +79,16 @@ using std::string; using std::vector; #define POLL_PERIOD 1.0 +#define STATUS_PERIOD 10 + // reports status this often enum JOB_STATUS {JOB_IN_PROGRESS, JOB_SUCCESS, JOB_FAIL}; struct RSC_USAGE { - double cpu_time; + double cpu_frac; double wss; void clear() { - cpu_time = 0; + cpu_frac = 0; wss = 0; } }; @@ -442,7 +444,10 @@ void poll_client_msgs() { } } -JOB_STATUS poll_app(RSC_USAGE &ru) { +// check whether job has exited +// Note: on both Podman and Docker this takes significant CPU time +// (like .03 sec) so do it infrequently (like 5 sec) +JOB_STATUS poll_app() { char cmd[1024]; vector out; int retval; @@ -461,6 +466,41 @@ JOB_STATUS poll_app(RSC_USAGE &ru) { return JOB_FAIL; } +// get CPU and mem usage +// This is also surprisingly slow +int get_stats(RSC_USAGE &ru) { + char cmd[1024]; + vector out; + int retval; + unsigned int n; + + sprintf(cmd, + "%s stats --no-stream --format \"{{.CPUPerc}} {{.MemUsage}}\" %s", + cli_prog, container_name + ); + retval = run_docker_command(cmd, out); + if (retval) return -1; + n = out.size(); + if (n == 0) return -1; + const char *buf = out[n-1].c_str(); + // output is like + // 0.00% 420KiB / 503.8GiB + double cpu_pct, mem; + char mem_unit; + n = sscanf(buf, "%lf%% %lf%c", &cpu_pct, &mem, &mem_unit); + if (n != 3) return -1; + switch (mem_unit) { + case 'G': mem *= GIGA; break; + case 'M': mem *= MEGA; break; + case 'K': mem *= KILO; break; + case 'B': break; + default: return -1; + } + ru.cpu_frac = cpu_pct/100.; + ru.wss = mem; + return 0; +} + #ifdef _WIN32 // find a WSL distro with Docker and set up a command link to it // @@ -543,6 +583,14 @@ int main(int argc, char** argv) { } if (verbose) 
config.print(); + if (sporadic) { + retval = boinc_sporadic_dir("."); + if (retval) { + fprintf(stderr, "can't create sporadic files\n"); + boinc_finish(retval); + } + } + #ifdef _WIN32 retval = wsl_init(); if (retval) { @@ -578,18 +626,35 @@ int main(int argc, char** argv) { boinc_finish(1); } running = true; - while (1) { + double cpu_time = 0; + for (int i=0; ; i++) { poll_client_msgs(); - switch(poll_app(ru)) { - case JOB_FAIL: - cleanup(); - boinc_finish(1); - break; - case JOB_SUCCESS: - copy_files_from_container(); - cleanup(); - boinc_finish(0); - break; + if (i%STATUS_PERIOD == 0) { + switch(poll_app()) { + case JOB_FAIL: + cleanup(); + boinc_finish(1); + break; + case JOB_SUCCESS: + copy_files_from_container(); + cleanup(); + boinc_finish(0); + break; + default: + break; + } + retval = get_stats(ru); + if (!retval) { + cpu_time += STATUS_PERIOD*ru.cpu_frac; + boinc_report_app_status_aux( + cpu_time, + 0, // checkpoint CPU time + 0, // frac done + 0, // other PID + 0,0, // bytes send/received + ru.wss + ); + } } boinc_sleep(POLL_PERIOD); } diff --git a/samples/vboxwrapper/vbox_common.cpp b/samples/vboxwrapper/vbox_common.cpp index 42664be0208..cd42a816ef1 100644 --- a/samples/vboxwrapper/vbox_common.cpp +++ b/samples/vboxwrapper/vbox_common.cpp @@ -316,7 +316,8 @@ void VBOX_VM::report_clean( fraction_done, vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); } diff --git a/samples/vboxwrapper/vboxwrapper.cpp b/samples/vboxwrapper/vboxwrapper.cpp index 9f15ee68aef..75eec21aa66 100644 --- a/samples/vboxwrapper/vboxwrapper.cpp +++ b/samples/vboxwrapper/vboxwrapper.cpp @@ -889,7 +889,8 @@ int main(int argc, char** argv) { fraction_done, pVM->vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); // Wait for up to 5 minutes for the VM to switch states. 
@@ -1373,7 +1374,8 @@ int main(int argc, char** argv) { fraction_done, pVM->vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); if (!retval) { From 086d52c41e13e9022aa184de3e71036bcd3c6229 Mon Sep 17 00:00:00 2001 From: davidpanderson Date: Tue, 29 Oct 2024 17:34:38 -0700 Subject: [PATCH 2/5] docker wrapper: look for exact container name, not substring --- client/app_test.cpp | 33 ++++++++++++++++--- samples/docker_wrapper/docker_wrapper.cpp | 10 +++--- .../docker_wrapper/test_copy/{in => infile} | 0 3 files changed, 35 insertions(+), 8 deletions(-) rename samples/docker_wrapper/test_copy/{in => infile} (100%) diff --git a/client/app_test.cpp b/client/app_test.cpp index 62b9abebd31..dbb9eb008ae 100644 --- a/client/app_test.cpp +++ b/client/app_test.cpp @@ -26,7 +26,7 @@ // input/output files, attributes, etc. // It currently has several test cases, selected with #ifdef // - build the BOINC client with these changes -// - make a BOINC data directory, say 'test' +// - Linux: make a BOINC data directory, say 'test' // (or you can use an existing BOINC data directory, // in which case the client will also run existing jobs) // - make a directory test/slots/app_test @@ -50,7 +50,7 @@ // define exactly one -#define APP_NONE +//#define APP_NONE //#define APP_WSL_WRAPPER // type physical logical copy? // app wsl_wrapper.exe wsl_wrapper.exe @@ -58,7 +58,7 @@ // app main main yes // input infile in // output outfile out -//#define APP_DOCKER_WRAPPER_COPY +#define APP_DOCKER_WRAPPER_COPY // type physical logical copy? 
// app worker worker yes // app job_copy.toml job_copy.toml yes @@ -224,6 +224,20 @@ void CLIENT_STATE::app_test_init() { *make_file(app->project, "Dockerfile_copy", "Dockerfile", INPUT_FILE, true) ); #endif +#ifdef APP_DOCKER_WRAPPER_MOUNT + av->app_files.push_back( + *make_file(app->project, "docker_wrapper.exe", NULL, MAIN_PROG, false) + ); + av->app_files.push_back( + *make_file(app->project, "worker", NULL, INPUT_FILE, false) + ); + av->app_files.push_back( + *make_file(app->project, "job_copy.toml", "job.toml", INPUT_FILE, true) + ); + av->app_files.push_back( + *make_file(app->project, "Dockerfile_copy", "Dockerfile", INPUT_FILE, true) + ); +#endif // can put other stuff here like #if 0 @@ -243,11 +257,17 @@ void CLIENT_STATE::app_test_init() { ); #endif #ifdef APP_DOCKER_WRAPPER_COPY + wu->command_line = "--verbose"; wu->input_files.push_back( *make_file(proj, "infile", "in", INPUT_FILE, true) ); #endif - +#ifdef APP_DOCKER_WRAPPER_MOUNT + wu->command_line = "--verbose"; + wu->input_files.push_back( + *make_file(proj, "infile", "in", INPUT_FILE, false) + ); +#endif RESULT *result = make_result(av, wu); ////////////// OUTPUT FILES ///////////////// @@ -262,6 +282,11 @@ void CLIENT_STATE::app_test_init() { *make_file(proj, "outfile", "out", OUTPUT_FILE, true) ); #endif +#ifdef APP_DOCKER_WRAPPER_MOUNT + result->output_files.push_back( + *make_file(proj, "outfile", "out", OUTPUT_FILE, false) + ); +#endif // tell the client not to get work or run benchmarks // diff --git a/samples/docker_wrapper/docker_wrapper.cpp b/samples/docker_wrapper/docker_wrapper.cpp index 09458bc445c..144520ff493 100644 --- a/samples/docker_wrapper/docker_wrapper.cpp +++ b/samples/docker_wrapper/docker_wrapper.cpp @@ -54,7 +54,7 @@ // image name: boinc // container name: boinc // slot dir: . 
-// project dir: project/ +// project dir (mount mode): project/ // enable standalone tests on Win // @@ -129,7 +129,7 @@ char container_name[512]; APP_INIT_DATA aid; CONFIG config; bool running; -bool verbose = true; +bool verbose = false; const char* config_file = "job.toml"; const char* dockerfile = "Dockerfile"; const char* cli_prog; @@ -250,8 +250,9 @@ int image_exists(bool &exists) { sprintf(cmd, "%s images", cli_prog); int retval = run_docker_command(cmd, out); if (retval) return retval; + string image_name_space = image_name + string(" "); for (string line: out) { - if (line.find(image_name) != string::npos) { + if (line.find(image_name_space) != string::npos) { exists = true; return 0; } @@ -472,7 +473,7 @@ int get_stats(RSC_USAGE &ru) { char cmd[1024]; vector out; int retval; - unsigned int n; + size_t n; sprintf(cmd, "%s stats --no-stream --format \"{{.CPUPerc}} {{.MemUsage}}\" %s", @@ -568,6 +569,7 @@ int main(int argc, char** argv) { boinc_init_options(&options); if (boinc_is_standalone()) { + verbose = true; strcpy(image_name, "boinc"); strcpy(container_name, "boinc"); strcpy(aid.project_dir, "./project"); diff --git a/samples/docker_wrapper/test_copy/in b/samples/docker_wrapper/test_copy/infile similarity index 100% rename from samples/docker_wrapper/test_copy/in rename to samples/docker_wrapper/test_copy/infile From 445ef91d32e06698b756b8ab312d9a464aabd01b Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 29 Oct 2024 17:57:35 -0700 Subject: [PATCH 3/5] tweak --- samples/docker_wrapper/docker_wrapper.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/docker_wrapper/docker_wrapper.cpp b/samples/docker_wrapper/docker_wrapper.cpp index 144520ff493..06fdfbe391d 100644 --- a/samples/docker_wrapper/docker_wrapper.cpp +++ b/samples/docker_wrapper/docker_wrapper.cpp @@ -30,7 +30,7 @@ // this is the first run of the job // if the image doesn't already exist // build image with 'docker build' -// (need a log around the 
above?) +// (need a lock around the above?) // create the container with -v to mount slot, project dirs // copy input files as needed // start container @@ -42,7 +42,7 @@ // image name // name: lower case letters, digits, separators (. _ -); max 4096 chars // tag: max 128 chars -// in the universal model, each WU has a different image +// in the universal model, each WU must have a different image // so we'll use: boinc____ // // container name: @@ -304,7 +304,7 @@ int container_exists(bool &exists) { int retval; vector out; - sprintf(cmd, "%s ps --filter \"name=%s\"", + sprintf(cmd, "%s ps --all --filter \"name=%s\"", cli_prog, container_name ); retval = run_docker_command(cmd, out); From e63c1d6a1f775153b6983c55fe76091db4a5a3db Mon Sep 17 00:00:00 2001 From: davidpanderson Date: Tue, 29 Oct 2024 23:21:26 -0700 Subject: [PATCH 4/5] docker wrapper: fix CPU time reporting --- samples/docker_wrapper/docker_wrapper.cpp | 43 ++++++++++++++++++----- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/samples/docker_wrapper/docker_wrapper.cpp b/samples/docker_wrapper/docker_wrapper.cpp index 06fdfbe391d..d31ebe6948d 100644 --- a/samples/docker_wrapper/docker_wrapper.cpp +++ b/samples/docker_wrapper/docker_wrapper.cpp @@ -219,7 +219,24 @@ inline int run_docker_command(char* cmd, vector &out) { retval = read_from_pipe( ctl_wc.out_read, ctl_wc.proc_handle, output, TIMEOUT, "EOM" ); - if (retval) return retval; + if (retval) { + const char* msg = ""; + switch (retval) { + case PROC_DIED: + msg = "Process died"; + break; + case TIMEOUT: + msg = "Timeout"; + break; + case READ_ERROR: + msg = "Read Error"; + break; + default: + break; + } + fprintf(stderr, "read_from_pipe() error: %s\n", msg); + return retval; + } out = split(output, '\n'); #else retval = run_command(cmd, out); @@ -481,9 +498,8 @@ int get_stats(RSC_USAGE &ru) { ); retval = run_docker_command(cmd, out); if (retval) return -1; - n = out.size(); - if (n == 0) return -1; - const char *buf = 
out[n-1].c_str(); + if (out.empty()) return -1; + const char *buf = out[0].c_str(); // output is like // 0.00% 420KiB / 503.8GiB double cpu_pct, mem; @@ -491,10 +507,18 @@ int get_stats(RSC_USAGE &ru) { n = sscanf(buf, "%lf%% %lf%c", &cpu_pct, &mem, &mem_unit); if (n != 3) return -1; switch (mem_unit) { - case 'G': mem *= GIGA; break; - case 'M': mem *= MEGA; break; - case 'K': mem *= KILO; break; - case 'B': break; + case 'G': + case 'g': + mem *= GIGA; break; + case 'M': + case 'm': + mem *= MEGA; break; + case 'K': + case 'k': + mem *= KILO; break; + case 'B': + case 'b': + break; default: return -1; } ru.cpu_frac = cpu_pct/100.; @@ -648,6 +672,9 @@ int main(int argc, char** argv) { retval = get_stats(ru); if (!retval) { cpu_time += STATUS_PERIOD*ru.cpu_frac; + if (verbose) { + fprintf(stderr, "reporting CPU %f WSS %f\n", cpu_time, ru.wss); + } boinc_report_app_status_aux( cpu_time, 0, // checkpoint CPU time From 984d0ab69a7452ee7edfd76796ef6dd2ecc14b85 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Wed, 30 Oct 2024 01:14:08 -0700 Subject: [PATCH 5/5] remove debugging #define --- client/app_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client/app_test.cpp b/client/app_test.cpp index dbb9eb008ae..57a2a862cfd 100644 --- a/client/app_test.cpp +++ b/client/app_test.cpp @@ -50,7 +50,7 @@ // define exactly one -//#define APP_NONE +#define APP_NONE //#define APP_WSL_WRAPPER // type physical logical copy? // app wsl_wrapper.exe wsl_wrapper.exe @@ -58,7 +58,7 @@ // app main main yes // input infile in // output outfile out -#define APP_DOCKER_WRAPPER_COPY +//#define APP_DOCKER_WRAPPER_COPY // type physical logical copy? // app worker worker yes // app job_copy.toml job_copy.toml yes