From 656019f62ebb746e348bda3e915971503ee1f669 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jere=20Lepp=C3=A4nen?=
Date: Mon, 14 Oct 2024 13:29:08 +0300
Subject: [PATCH 1/2] test: ml_perf: add ML performance test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add machine learning (ML) performance test application. The
application measures nanoseconds used for various ML operations, such
as creating and destroying a model, loading and unloading a model, and
inferencing.

Signed-off-by: Jere Leppänen
Reviewed-by: Tuomas Taipale
---
 test/performance/.gitignore    |    1 +
 test/performance/Makefile.am   |    5 +
 test/performance/odp_ml_perf.c | 1096 ++++++++++++++++++++++++++++++++
 3 files changed, 1102 insertions(+)
 create mode 100644 test/performance/odp_ml_perf.c

diff --git a/test/performance/.gitignore b/test/performance/.gitignore
index a311d115d6..879a32d19b 100644
--- a/test/performance/.gitignore
+++ b/test/performance/.gitignore
@@ -19,6 +19,7 @@ odp_l2fwd
 odp_l2fwd_perf
 odp_lock_perf
 odp_mem_perf
+odp_ml_perf
 odp_packet_gen
 odp_pktio_ordered
 odp_pktio_perf
diff --git a/test/performance/Makefile.am b/test/performance/Makefile.am
index b652262bdd..1a4e37eda5 100644
--- a/test/performance/Makefile.am
+++ b/test/performance/Makefile.am
@@ -42,6 +42,11 @@ if LIBCONFIG
 COMPILE_ONLY += odp_ipsecfwd
 endif
 
+if WITH_ML
+COMPILE_ONLY += odp_ml_perf
+odp_ml_perf_SOURCES = odp_ml_perf.c
+endif
+
 TESTSCRIPTS = odp_cpu_bench_run.sh \
	      odp_crypto_run.sh \
	      odp_dma_perf_run.sh \
diff --git a/test/performance/odp_ml_perf.c b/test/performance/odp_ml_perf.c
new file mode 100644
index 0000000000..13352468dc
--- /dev/null
+++ b/test/performance/odp_ml_perf.c
@@ -0,0 +1,1096 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2024 Nokia
+ */
+
+/**
+ * @example odp_ml_perf.c
+ *
+ * Performance test application for ML API
+ *
+ * @cond _ODP_HIDE_FROM_DOXYGEN_
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <odp_api.h>
+#include <odp/helper/odph_api.h>
+
+/* Max number of inputs and outputs */
+#define MAX_IO 8
+
+enum {
+	MODE_INFERENCE = 0,
+	MODE_INFERENCE_QUANT,
+	MODE_CREATE,
+	MODE_LOAD,
+	MODE_NUM,
+};
+
+typedef struct {
+	int mode, num_threads, rounds, latency, warmup;
+	char *model_name, *input_name, *reference_name;
+	float scale_q, scale_d;
+	int num_batch;
+} test_opt_t;
+
+static test_opt_t opt_def = {
+	.num_threads = 1,
+	.rounds = 50,
+	.warmup = 5,
+	.num_batch = 1,
+};
+
+typedef struct io_size {
+	uint64_t elems, size;
+	int elem_size;
+} io_size;
+
+typedef struct ODP_ALIGNED_CACHE {
+	uint64_t sum, min, max;
+} stat_t;
+
+typedef struct {
+	test_opt_t opt;
+	odp_barrier_t barrier;
+	odp_shm_t model_file_shm;
+	void *model_file_data;
+	uint64_t model_file_size;
+	odp_shm_t inp_file_shm;
+	void *inp_file_data;
+	uint64_t inp_file_size;
+	odp_shm_t ref_file_shm;
+	void *ref_file_data;
+	uint64_t ref_file_size;
+	odp_ml_model_t mdl;
+	odp_ml_capability_t capa;
+	odp_ml_model_info_t info;
+	int num_inp, num_out;
+	odp_ml_input_info_t inp_info[MAX_IO];
+	io_size inp[MAX_IO];
+	odp_ml_output_info_t out_info[MAX_IO];
+	io_size out[MAX_IO];
+	uint64_t inp_size_q, inp_size_d, out_size_q, out_size_d;
+	stat_t stat[ODP_THREAD_COUNT_MAX];
+} test_global_t;
+
+static test_global_t *glb;
+
+static void time_start(odp_time_t *t)
+{
+	*t = odp_time_local_strict();
+}
+
+static void time_elapsed(odp_time_t *t, int thr)
+{
+	odp_time_t stop = odp_time_local_strict();
+	uint64_t nsec = odp_time_diff_ns(stop, *t);
+
+	*t = stop;
+	glb->stat[thr].sum += nsec;
+	if (nsec < glb->stat[thr].min
|| !glb->stat[thr].min) + glb->stat[thr].min = nsec; + if (nsec > glb->stat[thr].max) + glb->stat[thr].max = nsec; +} + +static void *read_file(const char *name, uint64_t *size) +{ + void *addr = NULL; + FILE *file = fopen(name, "rb"); + + if (!file) { + ODPH_ERR("Failed to open file %s: %s\n", name, strerror(errno)); + return NULL; + } + + if (fseek(file, 0, SEEK_END)) { + ODPH_ERR("Failed to get file size for file %s\n", name); + goto error; + } + + long pos = ftell(file); + + if (pos < 0) { + ODPH_ERR("Failed to get file size for file %s\n", name); + goto error; + } + + rewind(file); + *size = pos; + addr = malloc(*size); + + if (!addr) { + ODPH_ERR("Allocating %" PRIu64 " bytes failed\n", *size); + goto error; + } + + if (fread(addr, *size, 1, file) != 1) { + ODPH_ERR("Reading %" PRIu64 " bytes failed\n", *size); + goto error; + } + + fclose(file); + printf("Read %" PRIu64 " bytes from %s\n", *size, name); + + return addr; + +error: + fclose(file); + free(addr); + + return NULL; +} + +static int read_file_to_shm(odp_shm_t *shm, void **data, uint64_t *size, const char *filename, + const char *name) +{ + void *file = read_file(filename, size); + + if (!file) + return -1; + + *shm = odp_shm_reserve(name, *size, ODP_CACHE_LINE_SIZE, 0); + + if (*shm != ODP_SHM_INVALID) + *data = odp_shm_addr(*shm); + + if (!*data) { + ODPH_ERR("Failed to reserve shm for %s\n", filename); + free(file); + return -1; + } + + memcpy(*data, file, *size); + free(file); + + return 0; +} + +static void usage(const char *prog) +{ + printf("\n" + "Usage: %s [options]\n" + "\n" + "Mandatory OPTIONS:\n" + " -M, --model Model file\n" + " -I, --input Input file\n" + "\n" + "Optional OPTIONS\n" + " -c, --num_cpu Number of CPUs (worker threads). 0: all available CPUs. Default 1.\n" + " -r, --rounds Number of test rounds. Default %u.\n" + " -m, --mode Test mode. Default 0.\n" + " 0: Inference\n" + " 1: Quantization-inference-dequantization\n" + " 2: Create-destroy\n" + " 3: Load-unload\n" + " -l, --latency Measure each round, report min, avg, max\n" + " -w, --warmup Warmup rounds. Default %d.\n" + " -R, --reference Reference file. 
To verify correctness, output from the last\n" + " inference is compared to this file.\n" + " -q, --quant Quantization scale\n" + " -d, --dequant Dequantization scale\n" + " -b, --batches Number of batches\n" + " -h, --help Help\n" + "\n", + prog, opt_def.rounds, opt_def.warmup); +} + +static int parse_args(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "model", required_argument, NULL, 'M' }, + { "input", required_argument, NULL, 'I' }, + { "num_cpu", required_argument, NULL, 'c' }, + { "rounds", required_argument, NULL, 'r' }, + { "mode", required_argument, NULL, 'm' }, + { "latency", no_argument, NULL, 'l' }, + { "warmup", required_argument, NULL, 'w' }, + { "reference", required_argument, NULL, 'R' }, + { "quant", required_argument, NULL, 'q' }, + { "dequant", required_argument, NULL, 'd' }, + { "batches", required_argument, NULL, 'b' }, + { "help", no_argument, NULL, 'h' }, + { NULL, 0, NULL, 0 } }; + + static const char *shortopts = "+M:I:c:r:m:lw:R:q:d:b:h"; + + glb->opt = opt_def; + + while (1) { + int c = getopt_long(argc, argv, shortopts, longopts, NULL); + + if (c == -1) + break; /* No more options */ + + switch (c) { + case 'M': + glb->opt.model_name = optarg; + break; + case 'I': + glb->opt.input_name = optarg; + break; + case 'c': + glb->opt.num_threads = atoi(optarg); + break; + case 'r': + glb->opt.rounds = atoi(optarg); + break; + case 'm': + glb->opt.mode = atoi(optarg); + break; + case 'l': + glb->opt.latency = 1; + break; + case 'w': + glb->opt.warmup = atoi(optarg); + break; + case 'R': + glb->opt.reference_name = optarg; + break; + case 'q': + glb->opt.scale_q = atof(optarg); + break; + case 'd': + glb->opt.scale_d = atof(optarg); + break; + case 'b': + glb->opt.num_batch = atoi(optarg); + break; + case 'h': + usage(argv[0]); + return 1; + default: + usage(argv[0]); + return -1; + } + } + + optind = 1; /* reset 'extern optind' from the getopt lib */ + + if (!glb->opt.model_name || !glb->opt.input_name) { + ODPH_ERR("Model and input files are mandatory\n"); + exit(EXIT_FAILURE); + } + + if (glb->opt.mode < 0 || glb->opt.mode >= MODE_NUM) { + ODPH_ERR("Invalid mode %d\n", glb->opt.mode); + exit(EXIT_FAILURE); + } + + printf("Options:\n"); + printf("--------\n"); + printf("model_name: %s\n", glb->opt.model_name); + printf("input_name: %s\n", glb->opt.input_name); + printf("num_cpu: %d\n", glb->opt.num_threads); + printf("rounds: %u\n", glb->opt.rounds); + printf("mode: %u\n", glb->opt.mode); + printf("latency: %d\n", glb->opt.latency); + printf("warmup: %d\n", glb->opt.warmup); + printf("reference_name: %s\n", glb->opt.reference_name); + printf("scale_q: %g\n", glb->opt.scale_q); + printf("scale_d: %g\n", glb->opt.scale_d); + printf("num_batch: %d\n", glb->opt.num_batch); + printf("\n"); + + return 0; +} + +static int check_num_batch(void) +{ + int min_batch = 1, max_batch = 1; + + for (int i = 0; i < glb->num_inp; i++) { + odp_ml_shape_info_t *shape = &glb->inp_info[i].shape; + + for (int j = 0; j < (int)shape->num_dim; j++) { + if (shape->dim[j] == ODP_ML_DIM_DYNAMIC) { + min_batch = shape->dim_min[j]; + max_batch = shape->dim_max[j]; + break; + } + } + } + + if (glb->opt.num_batch < min_batch || glb->opt.num_batch > max_batch) { + ODPH_ERR("Number of batches %d out of range [%d, %d]\n", glb->opt.num_batch, + min_batch, max_batch); + return -1; + } + + return 0; +} + +static void calc_io_size(void) +{ + for (int i = 0; i < glb->num_inp; i++) { + uint64_t elems = 1; + odp_ml_input_info_t *info = &glb->inp_info[i]; + odp_ml_shape_info_t *shape 
= &info->shape; + io_size *inp = &glb->inp[i]; + + printf("Input %d: %s, shape:", i, info->name); + + for (int j = 0; j < (int)shape->num_dim; j++) { + printf(" %d", shape->dim[j]); + if (shape->dim[j] != ODP_ML_DIM_DYNAMIC) + elems *= shape->dim[j]; + } + + if (shape->type == ODP_ML_SHAPE_BATCH) + elems *= glb->opt.num_batch; + inp->elems = elems; + inp->elem_size = info->data_type_size; + inp->size = elems * info->data_type_size; + glb->inp_size_q += inp->size; + glb->inp_size_d += elems * sizeof(float); + + printf(", elems: %" PRIu64 ", datatype size: %d, size: %" PRIu64 "\n", inp->elems, + inp->elem_size, inp->size); + } + + printf("Input size_q: %" PRIu64 ", size_d: %" PRIu64 "\n", glb->inp_size_q, + glb->inp_size_d); + + for (int i = 0; i < glb->num_out; i++) { + uint64_t elems = 1; + odp_ml_output_info_t *info = &glb->out_info[i]; + odp_ml_shape_info_t *shape = &info->shape; + io_size *out = &glb->out[i]; + + printf("Output %d: %s, shape:", i, info->name); + + for (int j = 0; j < (int)shape->num_dim; j++) { + printf(" %d", shape->dim[j]); + if (shape->dim[j] != ODP_ML_DIM_DYNAMIC) + elems *= shape->dim[j]; + } + + if (shape->type == ODP_ML_SHAPE_BATCH) + elems *= glb->opt.num_batch; + out->elems = elems; + out->elem_size = info->data_type_size; + out->size = elems * info->data_type_size; + glb->out_size_q += out->size; + glb->out_size_d += elems * sizeof(float); + + printf(", elems: %" PRIu64 ", datatype size: %d, size: %" PRIu64 "\n", out->elems, + out->elem_size, out->size); + } + + printf("Output size_q: %" PRIu64 ", size_d: %" PRIu64 "\n", glb->out_size_q, + glb->out_size_d); +} + +static int data_type_supported(odp_ml_data_type_t data_type) +{ + switch (data_type) { + case ODP_ML_DATA_TYPE_INT8: + case ODP_ML_DATA_TYPE_UINT8: + case ODP_ML_DATA_TYPE_FP16: + return 1; + default: + return 0; + } +} + +static void quantize_input(uint8_t *inp_q_addr, uint8_t *inp_d_addr) +{ + for (int i = 0; i < glb->num_inp; i++) { + float scale_q = glb->opt.scale_q; + uint64_t elems = glb->inp[i].elems; + odp_ml_data_type_t data_type = glb->inp_info[i].data_type; + + switch (data_type) { + case ODP_ML_DATA_TYPE_INT8: + odp_ml_fp32_to_int8((int8_t *)inp_q_addr, (float *)inp_d_addr, elems, + scale_q, 0); + break; + case ODP_ML_DATA_TYPE_UINT8: + odp_ml_fp32_to_uint8((uint8_t *)inp_q_addr, (float *)inp_d_addr, elems, + scale_q, 0); + break; + case ODP_ML_DATA_TYPE_FP16: + odp_ml_fp32_to_fp16((uint16_t *)inp_q_addr, (float *)inp_d_addr, elems); + break; + default: + ODPH_ERR("Unsupported type %d for input %d\n", data_type, i); + } + + inp_q_addr += glb->inp[i].size; + inp_d_addr += elems * sizeof(float); + } +} + +static void dequantize_output(uint8_t *out_d_addr, uint8_t *out_q_addr) +{ + for (int i = 0; i < glb->num_out; i++) { + float scale_d = glb->opt.scale_d; + uint64_t elems = glb->out[i].elems; + odp_ml_data_type_t data_type = glb->out_info[i].data_type; + + switch (data_type) { + case ODP_ML_DATA_TYPE_INT8: + odp_ml_fp32_from_int8((float *)out_d_addr, (int8_t *)out_q_addr, elems, + scale_d, 0); + break; + case ODP_ML_DATA_TYPE_UINT8: + odp_ml_fp32_from_uint8((float *)out_d_addr, (uint8_t *)out_q_addr, elems, + scale_d, 0); + break; + case ODP_ML_DATA_TYPE_FP16: + odp_ml_fp32_from_fp16((float *)out_d_addr, (uint16_t *)out_q_addr, elems); + break; + default: + ODPH_ERR("Unsupported type %d for output %d\n", data_type, i); + } + + out_q_addr += glb->out[i].size; + out_d_addr += elems * sizeof(float); + } +} + +static int test_ml(void *ptr) +{ + odp_time_t time; + odp_ml_data_seg_t 
inp_seg[MAX_IO]; + uint8_t *inp_addr; + odp_ml_data_seg_t out_seg[MAX_IO]; + uint8_t *out_addr; + int thread_idx = (int)(uintptr_t)ptr; + void *input = NULL, *output = NULL, *output_file = NULL; + void *output_final = NULL; + uint64_t out_size_final = 0; + int ret = 0; + + inp_addr = glb->inp_file_data; + + if (glb->opt.scale_q > 0.0) { + input = malloc(glb->inp_size_q); + if (!input) { + ODPH_ERR("Allocating %" PRIu64 " bytes failed\n", glb->inp_size_q); + ret = -1; + goto error; + } + + inp_addr = input; + } + + for (int i = 0; i < glb->num_inp; i++) { + inp_seg[i].addr = inp_addr; + inp_seg[i].size = glb->inp[i].size; + inp_addr += glb->inp[i].size; + } + + output = malloc(glb->out_size_q); + + if (!output) { + ODPH_ERR("Allocating %" PRIu64 " bytes failed\n", glb->out_size_q); + ret = -1; + goto error; + } + + out_addr = output; + output_final = output; + out_size_final = glb->out_size_q; + + if (glb->opt.scale_d > 0.0) { + output_file = malloc(glb->out_size_d); + if (!output_file) { + ODPH_ERR("Allocating %" PRIu64 " bytes failed\n", glb->out_size_d); + ret = -1; + goto error; + } + + output_final = output_file; + out_size_final = glb->out_size_d; + } + + for (int i = 0; i < glb->num_out; i++) { + out_seg[i].addr = out_addr; + out_seg[i].size = glb->out[i].size; + out_addr += glb->out[i].size; + } + + odp_ml_data_t data = { + .input_seg = inp_seg, + .num_input_seg = glb->num_inp, + .output_seg = out_seg, + .num_output_seg = glb->num_out, + }; + odp_ml_run_param_t run_param; + + odp_ml_run_param_init(&run_param); + run_param.batch_size = glb->opt.num_batch; + + if (glb->opt.mode != MODE_INFERENCE_QUANT && glb->opt.scale_q > 0.0) + quantize_input(input, glb->inp_file_data); + + odp_barrier_wait(&glb->barrier); + + for (int i = 0; i < glb->opt.rounds + glb->opt.warmup; i++) { + if (i == glb->opt.warmup) + time_start(&time); + + if (glb->opt.mode == MODE_INFERENCE_QUANT && glb->opt.scale_q > 0.0) + quantize_input(input, glb->inp_file_data); + + int r; + + while (!(r = odp_ml_run(glb->mdl, &data, &run_param))) + ; + if (r != 1) { + ODPH_ERR("odp_ml_run() failed\n"); + ret = -1; + goto error; + } + + if (glb->opt.mode == MODE_INFERENCE_QUANT && glb->opt.scale_d > 0.0) + dequantize_output(output_file, output); + + if (glb->opt.latency && i >= glb->opt.warmup) + time_elapsed(&time, thread_idx); + } + + if (!glb->opt.latency) + time_elapsed(&time, thread_idx); + + if (glb->opt.mode != MODE_INFERENCE_QUANT && glb->opt.scale_d > 0.0) + dequantize_output(output_file, output); + + if (glb->ref_file_data) { + if (out_size_final != glb->ref_file_size) { + ODPH_ERR("Output size mismatch: %" PRIu64 + " differs from reference file size %" PRIu64 "\n", + out_size_final, glb->ref_file_size); + ret = -1; + goto error; + } + + if (memcmp(glb->ref_file_data, output_final, out_size_final)) { + ODPH_ERR("Output differs from reference\n"); + ret = -1; + goto error; + } + } + +error: + free(input); + free(output); + free(output_file); + + return ret; +} + +static int test_ml_create(void *ptr) +{ + odp_time_t time; + int thread_idx = (int)(uintptr_t)ptr; + odp_ml_model_t mdl; + odp_ml_model_param_t model_param; + int ret = 0; + + odp_ml_model_param_init(&model_param); + model_param.model = glb->model_file_data; + model_param.size = glb->model_file_size; + + odp_barrier_wait(&glb->barrier); + + for (int i = 0; i < glb->opt.rounds + glb->opt.warmup; i++) { + if (i == glb->opt.warmup) + time_start(&time); + + mdl = odp_ml_model_create(glb->opt.model_name, &model_param); + if (mdl == ODP_ML_MODEL_INVALID) { + 
ODPH_ERR("odp_ml_model_create() failed\n"); + ret = -1; + goto error; + } + + if (odp_ml_model_destroy(mdl)) { + ODPH_ERR("odp_ml_model_destroy() failed\n"); + ret = -1; + goto error; + } + + if (glb->opt.latency && i >= glb->opt.warmup) + time_elapsed(&time, thread_idx); + } + + if (!glb->opt.latency) + time_elapsed(&time, thread_idx); + +error: + return ret; +} + +static int test_ml_load(void *ptr) +{ + odp_time_t time; + int thread_idx = (int)(uintptr_t)ptr; + odp_ml_model_t mdl; + odp_ml_model_param_t model_param; + int ret = 0; + + odp_ml_model_param_init(&model_param); + model_param.model = glb->model_file_data; + model_param.size = glb->model_file_size; + + mdl = odp_ml_model_create(glb->opt.model_name, &model_param); + if (mdl == ODP_ML_MODEL_INVALID) { + ODPH_ERR("odp_ml_model_create() failed\n"); + ret = -1; + goto error; + } + + odp_barrier_wait(&glb->barrier); + + for (int i = 0; i < glb->opt.rounds + glb->opt.warmup; i++) { + if (i == glb->opt.warmup) + time_start(&time); + + if (odp_ml_model_load(mdl, NULL)) { + ODPH_ERR("odp_ml_model_load() failed\n"); + ret = -1; + goto error; + } + + if (odp_ml_model_unload(mdl, NULL)) { + ODPH_ERR("odp_ml_model_unload() failed\n"); + ret = -1; + goto error; + } + + if (glb->opt.latency && i >= glb->opt.warmup) + time_elapsed(&time, thread_idx); + } + + if (!glb->opt.latency) + time_elapsed(&time, thread_idx); + + if (odp_ml_model_destroy(mdl)) { + ODPH_ERR("odp_ml_model_destroy() failed\n"); + ret = -1; + goto error; + } + +error: + return ret; +} + +static void print_results_avg(void) +{ + int num_threads = glb->opt.num_threads; + int rounds = glb->opt.rounds; + + printf("thread %15s %15s\n", "avg (nsec)", "rounds / sec"); + printf("--------------------------------------\n"); + if (num_threads > 1) { + uint64_t avg = 0; + + for (int i = 0; i < num_threads; i++) + avg += glb->stat[i].sum; + + avg /= rounds * num_threads; + printf("%-6s %15" PRIu64 " %15.1f\n", "all", avg, (float)ODP_TIME_SEC_IN_NS / avg); + } + for (int i = 0; i < num_threads; i++) { + glb->stat[i].sum /= rounds; + printf("%-6d %15" PRIu64 " %15.1f\n", i, glb->stat[i].sum, + (float)ODP_TIME_SEC_IN_NS / glb->stat[i].sum); + } +} + +static void print_stat(stat_t *stat) +{ + printf("%15" PRIu64 " %15" PRIu64 " %15" PRIu64, stat->min, stat->sum, stat->max); +} + +static void print_results_min_avg_max(void) +{ + int num_threads = glb->opt.num_threads; + int rounds = glb->opt.rounds; + + printf("thread %15s %15s %15s\n", "min (nsec)", "avg (nsec)", "max (nsec)"); + printf("------------------------------------------------------\n"); + if (num_threads > 1) { + stat_t s = { 0 }; + + for (int i = 0; i < num_threads; i++) { + s.sum += glb->stat[i].sum; + if (glb->stat[i].min && (glb->stat[i].min < s.min || !s.min)) + s.min = glb->stat[i].min; + if (glb->stat[i].max > s.max) + s.max = glb->stat[i].max; + } + + s.sum /= rounds * num_threads; + printf("%-6s ", "all"); + print_stat(&s); + printf("\n"); + } + for (int i = 0; i < num_threads; i++) { + glb->stat[i].sum /= rounds; + printf("%-6d ", i); + print_stat(&glb->stat[i]); + printf("\n"); + } +} + +static void print_results(void) +{ + printf("\n"); + + if (glb->opt.latency) + print_results_min_avg_max(); + else + print_results_avg(); + + printf("\n"); +} + +int main(int argc, char *argv[]) +{ + odp_instance_t inst; + odph_helper_options_t helper_options; + odp_init_t init; + odp_shm_t shm_glb; + int ret = 0; + + /* Let helper collect its own arguments (e.g. 
--odph_proc) */ + argc = odph_parse_options(argc, argv); + + if (odph_options(&helper_options)) { + ODPH_ERR("Failed to read ODP helper options.\n"); + exit(EXIT_FAILURE); + } + + /* List features not to be used */ + odp_init_param_init(&init); + init.not_used.feat.cls = 1; + init.not_used.feat.compress = 1; + init.not_used.feat.crypto = 1; + init.not_used.feat.dma = 1; + init.not_used.feat.ipsec = 1; + init.not_used.feat.schedule = 1; + init.not_used.feat.timer = 1; + init.not_used.feat.tm = 1; + init.mem_model = helper_options.mem_model; + + /* Init ODP before calling anything else */ + if (odp_init_global(&inst, &init, NULL)) { + ODPH_ERR("Global init failed.\n"); + exit(EXIT_FAILURE); + } + + /* Init this thread */ + if (odp_init_local(inst, ODP_THREAD_CONTROL)) { + ODPH_ERR("Local init failed.\n"); + exit(EXIT_FAILURE); + } + + odp_sys_info_print(); + + shm_glb = odp_shm_reserve("test_globals", sizeof(test_global_t), ODP_CACHE_LINE_SIZE, 0); + + if (shm_glb != ODP_SHM_INVALID) + glb = (test_global_t *)odp_shm_addr(shm_glb); + + if (!glb) { + ODPH_ERR("Failed to reserve shm\n"); + ret = -1; + goto odp_term; + } + + memset(glb, 0, sizeof(test_global_t)); + + ret = parse_args(argc, argv); + if (ret) { + if (ret > 0) + ret = 0; + goto odp_term; + } + + if (read_file_to_shm(&glb->model_file_shm, &glb->model_file_data, &glb->model_file_size, + glb->opt.model_name, "ml_perf_model")) { + ret = -1; + goto odp_term; + } + + if (read_file_to_shm(&glb->inp_file_shm, &glb->inp_file_data, &glb->inp_file_size, + glb->opt.input_name, "ml_perf_input")) { + ret = -1; + goto odp_term; + } + + if (glb->opt.reference_name) { + if (read_file_to_shm(&glb->ref_file_shm, &glb->ref_file_data, &glb->ref_file_size, + glb->opt.reference_name, "ml_perf_reference")) { + ret = -1; + goto odp_term; + } + } + + if (odp_ml_capability(&glb->capa)) { + ODPH_ERR("odp_ml_capability() failed\n"); + ret = -1; + goto odp_term; + } + + if (glb->capa.max_models < 1) { + ODPH_ERR("ML not supported (maximum number of models is zero)\n"); + ret = -1; + goto odp_term; + } + + if (glb->capa.max_model_size < glb->model_file_size) { + ODPH_ERR("Model size %" PRIu64 " exceeds maximum %" PRIu64 "\n", + glb->model_file_size, glb->capa.max_model_size); + ret = -1; + goto odp_term; + } + + if (glb->capa.min_input_align > 1) { + ODPH_ERR("Minimum input alignment %u not supported\n", glb->capa.min_input_align); + ret = -1; + goto odp_term; + } + + if (glb->capa.min_output_align > 1) { + ODPH_ERR("Minimum output alignment %u not supported\n", glb->capa.min_output_align); + ret = -1; + goto odp_term; + } + + if ((glb->opt.mode == MODE_CREATE || glb->opt.mode == MODE_LOAD) && + (int)glb->capa.max_models < glb->opt.num_threads) { + ODPH_ERR("Maximum number of models %u less than number of threads %d\n", + glb->capa.max_models, glb->opt.num_threads); + ret = -1; + goto odp_term; + } + + if (glb->opt.mode == MODE_LOAD && (int)glb->capa.max_models_loaded < glb->opt.num_threads) { + ODPH_ERR("Maximum number of models loaded %u less than number of threads %d\n", + glb->capa.max_models_loaded, glb->opt.num_threads); + ret = -1; + goto odp_term; + } + + odp_ml_config_t ml_config; + odp_ml_model_param_t model_param; + + odp_ml_config_init(&ml_config); + ml_config.max_model_size = glb->capa.max_model_size; + ml_config.load_mode_mask = ODP_ML_COMPL_MODE_SYNC; + ml_config.run_mode_mask = ODP_ML_COMPL_MODE_SYNC; + + if (odp_ml_config(&ml_config)) { + ODPH_ERR("odp_ml_config() failed\n"); + ret = -1; + goto odp_term; + } + + 
odp_ml_model_param_init(&model_param); + model_param.model = glb->model_file_data; + model_param.size = glb->model_file_size; + + glb->mdl = odp_ml_model_create(glb->opt.model_name, &model_param); + if (glb->mdl == ODP_ML_MODEL_INVALID) { + ODPH_ERR("odp_ml_model_create() failed\n"); + ret = -1; + goto odp_term; + } + + odp_ml_model_print(glb->mdl); + + if (odp_ml_model_load(glb->mdl, NULL)) { + ODPH_ERR("odp_ml_model_load() failed\n"); + ret = -1; + goto odp_term; + } + + if (odp_ml_model_info(glb->mdl, &glb->info)) { + ODPH_ERR("odp_ml_model_info() failed\n"); + ret = -1; + goto odp_term; + } + + glb->num_inp = odp_ml_model_input_info(glb->mdl, glb->inp_info, MAX_IO); + + if (glb->num_inp < 0 || glb->num_inp > MAX_IO) { + ODPH_ERR("odp_ml_model_input_info() failed, or too many inputs\n"); + ret = -1; + goto odp_term; + } + + glb->num_out = odp_ml_model_output_info(glb->mdl, glb->out_info, MAX_IO); + + if (glb->num_out < 0 || glb->num_out > MAX_IO) { + ODPH_ERR("odp_ml_model_output_info() failed, or too many outputs\n"); + ret = -1; + goto odp_term; + } + + if (check_num_batch()) { + ret = -1; + goto odp_term; + } + + calc_io_size(); + + if (glb->opt.scale_q > 0.0) { + for (int i = 0; i < glb->num_inp; i++) { + if (!data_type_supported(glb->inp_info[i].data_type)) { + ODPH_ERR("Unsupported data type %d for input %d quantization\n", + glb->inp_info[i].data_type, i); + ret = -1; + goto odp_term; + } + } + for (int i = 0; i < glb->num_out; i++) { + if (!data_type_supported(glb->out_info[i].data_type)) { + ODPH_ERR("Unsupported data type %d for output %d dequantization\n", + glb->out_info[i].data_type, i); + ret = -1; + goto odp_term; + } + } + } + + if ((glb->opt.scale_q > 0.0 && glb->inp_file_size != glb->inp_size_d) || + (!(glb->opt.scale_q > 0.0) && glb->inp_file_size != glb->inp_size_q)) { + ODPH_ERR("Input file size mismatch\n"); + ret = -1; + goto odp_term; + } + + if (glb->opt.mode != MODE_INFERENCE && glb->opt.mode != MODE_INFERENCE_QUANT) { + const odp_ml_model_t mdl = glb->mdl; + + glb->mdl = ODP_ML_MODEL_INVALID; + + if (odp_ml_model_unload(mdl, NULL)) { + ODPH_ERR("odp_ml_model_unload() failed\n"); + ret = -1; + goto odp_term; + } + + if (odp_ml_model_destroy(mdl)) { + ODPH_ERR("odp_ml_model_destroy() failed\n"); + ret = -1; + goto odp_term; + } + } + + int num_threads = glb->opt.num_threads; + + odp_barrier_init(&glb->barrier, num_threads); + + odp_cpumask_t cpumask; + odph_thread_common_param_t thr_common; + odph_thread_param_t thr_param[ODP_THREAD_COUNT_MAX]; + odph_thread_t thr_worker[ODP_THREAD_COUNT_MAX]; + odph_thread_join_result_t res[ODP_THREAD_COUNT_MAX]; + + if (odp_cpumask_default_worker(&cpumask, num_threads) != num_threads) { + ODPH_ERR("Failed to get default CPU mask.\n"); + ret = -1; + goto odp_term; + } + + odph_thread_common_param_init(&thr_common); + thr_common.instance = inst; + thr_common.cpumask = &cpumask; + + for (int i = 0; i < num_threads; i++) { + odph_thread_param_init(&thr_param[i]); + thr_param[i].thr_type = ODP_THREAD_WORKER; + thr_param[i].arg = (void *)(uintptr_t)i; + switch (glb->opt.mode) { + case MODE_INFERENCE: + case MODE_INFERENCE_QUANT: + thr_param[i].start = test_ml; + break; + case MODE_CREATE: + thr_param[i].start = test_ml_create; + break; + case MODE_LOAD: + thr_param[i].start = test_ml_load; + break; + } + } + + memset(&thr_worker, 0, sizeof(thr_worker)); + + if (odph_thread_create(thr_worker, &thr_common, thr_param, num_threads) != num_threads) { + ODPH_ERR("Failed to create worker threads.\n"); + ret = -1; + goto odp_term; + } + + 
if (odph_thread_join_result(thr_worker, res, num_threads) != num_threads) { + ODPH_ERR("Failed to join worker threads.\n"); + ret = -1; + goto odp_term; + } + + for (int i = 0; i < num_threads; i++) { + if (res[i].is_sig || res[i].ret != 0) { + ODPH_ERR("Worker thread failure%s: %d\n", + res[i].is_sig ? " (signaled)" : "", res[i].ret); + ret = -1; + goto odp_term; + } + } + + print_results(); + +odp_term: + + if (glb->mdl != ODP_ML_MODEL_INVALID) { + if (odp_ml_model_unload(glb->mdl, NULL)) { + ODPH_ERR("odp_ml_model_unload() failed\n"); + ret = -1; + } + + if (odp_ml_model_destroy(glb->mdl)) { + ODPH_ERR("odp_ml_model_destroy() failed\n"); + ret = -1; + } + } + + if (glb->model_file_shm != ODP_SHM_INVALID && odp_shm_free(glb->model_file_shm)) { + ODPH_ERR("odp_shm_free() failed\n"); + ret = -1; + } + + if (glb->inp_file_shm != ODP_SHM_INVALID && odp_shm_free(glb->inp_file_shm)) { + ODPH_ERR("odp_shm_free() failed\n"); + ret = -1; + } + + if (glb->ref_file_shm != ODP_SHM_INVALID && odp_shm_free(glb->ref_file_shm)) { + ODPH_ERR("odp_shm_free() failed\n"); + ret = -1; + } + + if (shm_glb != ODP_SHM_INVALID && odp_shm_free(shm_glb)) { + ODPH_ERR("odp_shm_free() failed\n"); + ret = -1; + } + + if (odp_term_local()) { + ODPH_ERR("Local term failed\n"); + return -1; + } + + if (odp_term_global(inst)) { + ODPH_ERR("Global term failed\n"); + return -1; + } + + return ret; +} From aa00ec1d7a802315329f97661863b97c1d54214e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jere=20Lepp=C3=A4nen?= Date: Fri, 25 Oct 2024 14:34:17 +0300 Subject: [PATCH 2/2] linux-gen: perf: add odp_ml_perf_run.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ML performance test run for linux-gen. This runs some basic performance tests using the conv.onnx model. Signed-off-by: Jere Leppänen Reviewed-by: Tuomas Taipale --- .../test/performance/Makefile.am | 26 +++++++++++++++++++ .../test/performance/odp_ml_perf_run.sh | 20 ++++++++++++++ 2 files changed, 46 insertions(+) create mode 100755 platform/linux-generic/test/performance/odp_ml_perf_run.sh diff --git a/platform/linux-generic/test/performance/Makefile.am b/platform/linux-generic/test/performance/Makefile.am index 4070f09f2e..2a05607015 100644 --- a/platform/linux-generic/test/performance/Makefile.am +++ b/platform/linux-generic/test/performance/Makefile.am @@ -1 +1,27 @@ SUBDIRS = dmafwd + +if WITH_ML +TESTS = odp_ml_perf_run.sh +EXTRA_DIST = odp_ml_perf_run.sh +endif + +# If building out-of-tree, make check will not copy the scripts and data to the +# $(builddir) assuming that all commands are run locally. However this prevents +# running tests on a remote target using LOG_COMPILER. +# So copy all script and data files explicitly here. 
+all-local: + if [ "x$(srcdir)" != "x$(builddir)" ]; then \ + for f in $(EXTRA_DIST); do \ + if [ -e $(srcdir)/$$f ]; then \ + mkdir -p $(builddir)/$$(dirname $$f); \ + cp -f $(srcdir)/$$f $(builddir)/$$f; \ + fi \ + done \ + fi + +clean-local: + if [ "x$(srcdir)" != "x$(builddir)" ]; then \ + for f in $(EXTRA_DIST); do \ + rm -f $(builddir)/$$f; \ + done \ + fi diff --git a/platform/linux-generic/test/performance/odp_ml_perf_run.sh b/platform/linux-generic/test/performance/odp_ml_perf_run.sh new file mode 100755 index 0000000000..8ddd086866 --- /dev/null +++ b/platform/linux-generic/test/performance/odp_ml_perf_run.sh @@ -0,0 +1,20 @@ +#!/bin/sh -xe +# +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024 Nokia + +TEST_DIR="${TEST_DIR:-$(dirname $0)}" + +cd $TEST_DIR +BIN_DIR=../../../../test/performance +MODEL_DIR=../../example/ml + +run_conv() { + $BIN_DIR/odp_ml_perf${EXEEXT} -M $MODEL_DIR/conv.onnx \ + -I $MODEL_DIR/conv-input.bin -R $MODEL_DIR/conv-output.bin $@ +} + +run_conv -m 0 -r 1000 +run_conv -m 0 -r 1000 -l +run_conv -m 2 -l +run_conv -m 3 -l
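The run script above exercises the inference (0), create-destroy (2) and load-unload (3) modes. Mode 1 (quantize input, run inference, dequantize output on every round) needs a float32 input file plus model-specific quantization and dequantization scales, so the script does not cover it. A hypothetical invocation is sketched below; the model name, input file and scale values are placeholders, not files shipped with ODP:

  # -q/-d give the quantization and dequantization scales (placeholders here);
  # with -q the input file must contain fp32 data, which the app quantizes
  # before each inference and dequantizes back after it.
  ./odp_ml_perf -M model.onnx -I input_fp32.bin -m 1 -q 0.05 -d 0.02 -r 1000 -l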