From 650085b2f3643aad05c629425983491d63b5c289 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Tue, 20 Sep 2022 13:41:41 -0700 Subject: [PATCH] examples: add matmul performance test --- examples/matmul_perf.cpp | 271 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 examples/matmul_perf.cpp diff --git a/examples/matmul_perf.cpp b/examples/matmul_perf.cpp new file mode 100644 index 00000000000..ff09ef08188 --- /dev/null +++ b/examples/matmul_perf.cpp @@ -0,0 +1,271 @@ +/******************************************************************************* +* Copyright 2022 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "example_utils.hpp" +#include "oneapi/dnnl/dnnl.hpp" + +using namespace dnnl; + +using tag = memory::format_tag; +using dt = memory::data_type; + +struct gemm_dims_t { + memory::dim m, n, k; +}; + +static const int min_runs = 4; + +const char *get_type_string(dt type) { + const char *type_string = "unknown"; + +#define TYPE_CASE(T) \ + if (type == dt::T) type_string = #T; + TYPE_CASE(f16); + TYPE_CASE(f32); + TYPE_CASE(f64); + TYPE_CASE(bf16); + TYPE_CASE(s8); + TYPE_CASE(u8); +#undef TYPE_CASE + + return type_string; +} + +void print_test_case(dt type, gemm_dims_t dims) { + std::cout << '[' << std::setw(4) << get_type_string(type); + if (dims.m == dims.n && dims.m == dims.k) + std::cout << " m = n = k = " << dims.m; + else + std::cout << " m = " << dims.m << ", n = " << dims.n + << ", k = " << dims.k; + std::cout << "] " << std::flush; +} + +void fill_random(std::vector &out, bool is_integer) { + static std::vector random_data_i, random_data_f; + constexpr size_t nrand = 1037; + + if (random_data_i.empty() || random_data_f.empty()) { + std::mt19937 generator; + std::uniform_int_distribution dist_i(-16, 15); + std::uniform_real_distribution dist_f(-1.0f, 1.0f); + + random_data_i.resize(nrand); + for (auto &d : random_data_i) + d = dist_i(generator); + + random_data_f.resize(nrand); + for (auto &d : random_data_f) + d = dist_f(generator); + } + + auto &rd = is_integer ? random_data_i : random_data_f; + + for (size_t i = 0; i < out.size(); i += nrand) { + size_t chunk = std::min(nrand, out.size() - i); + std::memcpy(&out[i], rd.data(), chunk * sizeof(float)); + } +} + +double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims, + double time_limit = 0.) { + bool is_integer = (type == dt::s8 || type == dt::u8); + bool quick_test = (time_limit == 0.); + + // Create execution dnnl::engine. + dnnl::engine engine(engine_kind, 0); + + // Create dnnl::stream. + dnnl::stream engine_stream(engine); + + // Source (A), weights (B), and destination (C) matrix dimensions. + memory::dims a_dims = {dims.m, dims.k}; + memory::dims b_dims = {dims.k, dims.n}; + memory::dims c_dims = {dims.m, dims.n}; + + // Allocate buffers and random-initialize A/B + std::vector a_data(product(a_dims)); + std::vector b_data(product(b_dims)); + std::vector c_data(product(c_dims)); + + fill_random(a_data, is_integer); + fill_random(b_data, is_integer); + + // Create memory descriptors and memory objects for src, weights, bias, and + // dst. + auto a_md = memory::desc(a_dims, type, tag::any); + auto b_md = memory::desc(b_dims, type, tag::any); + auto c_md = memory::desc(c_dims, type, tag::any); + + auto a_in_md = memory::desc(a_dims, dt::f32, tag::ab); + auto b_in_md = memory::desc(b_dims, dt::f32, tag::ab); + + auto a_in_mem = memory(a_in_md, engine); + auto b_in_mem = memory(b_in_md, engine); + + // Write data to memory object's handles. + write_to_dnnl_memory(a_data.data(), a_in_mem); + write_to_dnnl_memory(b_data.data(), b_in_mem); + + // Create operation descriptor + auto matmul_d = matmul::desc(a_md, b_md, c_md); + + // Create primitive descriptor. + auto matmul_pd = matmul::primitive_desc(matmul_d, engine); + + // Repack and convert input data. + auto a_mem = memory(matmul_pd.src_desc(), engine); + reorder(a_in_mem, a_mem).execute(engine_stream, a_in_mem, a_mem); + + auto b_mem = memory(matmul_pd.weights_desc(), engine); + reorder(b_in_mem, b_mem).execute(engine_stream, b_in_mem, b_mem); + + auto c_mem = memory(matmul_pd.dst_desc(), engine); + + // Create the primitive. + auto matmul_prim = matmul(matmul_pd); + + // Start output. + if (!quick_test) print_test_case(type, dims); + + // Primitive arguments. + std::unordered_map matmul_args; + matmul_args.insert({DNNL_ARG_SRC, a_mem}); + matmul_args.insert({DNNL_ARG_WEIGHTS, b_mem}); + matmul_args.insert({DNNL_ARG_DST, c_mem}); + + // Warmup executions. + matmul_prim.execute(engine_stream, matmul_args); + engine_stream.wait(); + + auto start_first = std::chrono::steady_clock::now(); + matmul_prim.execute(engine_stream, matmul_args); + engine_stream.wait(); + auto end_first = std::chrono::steady_clock::now(); + + std::chrono::duration dur_first = end_first - start_first; + + if (quick_test) return dur_first.count(); + + int runs = std::max(min_runs, int(time_limit / dur_first.count())); + + // Timing runs. + auto start = std::chrono::steady_clock::now(); + + for (int i = 0; i <= runs; i++) + matmul_prim.execute(engine_stream, matmul_args); + engine_stream.wait(); + + auto end = std::chrono::steady_clock::now(); + + std::chrono::duration duration = end - start; + + // Display the result. + double avg_time = (duration.count() - dur_first.count()) / runs; + double total_ops = double(dims.m) * double(dims.n) * double(dims.k) * 2; + double perf = (total_ops / avg_time) * 1e-9; + + auto scale_string = "G"; + auto unit_string = is_integer ? "Op/s" : "Flop/s"; + + if (perf >= 1000) { + perf /= 1000; + scale_string = "T"; + } + + std::cout << perf << ' ' << scale_string << unit_string << std::endl; + + return avg_time; +} + +void run(engine::kind engine_kind, dt type, gemm_dims_t dims, + double time_limit) { + try { + if (dims.m * dims.n != 0) { + // Dimensions manually specified by user. + run_case(engine_kind, type, dims, time_limit); + } else { + // Automatically choose dimensions to fit time limit. + int mnk = 128; + const int max_mnk = 8192; + + while (mnk < max_mnk) { + dims.m = dims.n = dims.k = mnk; + double time1 = run_case(engine_kind, type, dims); + double nruns_est = std::max(1., time_limit / time1); + double mnk_expand = std::exp2( + std::round(std::log2(nruns_est / min_runs) / 3.)); + if (mnk_expand <= 1) break; + mnk = std::min(max_mnk, mnk * mnk_expand); + } + + dims.m = dims.n = dims.k = mnk; + run_case(engine_kind, type, dims, time_limit); + } + } catch (dnnl::error &e) { + // Catch and report unimplemented cases. + if (e.status == dnnl_unimplemented) { + print_test_case(type, dims); + std::cout << "unsupported" << std::endl; + } else + throw; + } +} + +void bad_args() { + std::cerr << "Usage: matmul-perf-cpp [cpu|gpu]\n" + " matmul-perf-cpp [cpu|gpu] \n" + " matmul-perf-cpp [cpu|gpu] \n" + "If a single is specified, it is used for all three " + "dimensions (m/n/k).\n"; + throw std::invalid_argument("Incorrect input arguments."); +} + +void matmul_perf(engine::kind engine_kind, int argc, char **argv) { + gemm_dims_t dims = {0, 0, 0}; + + if (argc > 2) { + if (argc == 3) + dims.m = dims.n = dims.k = std::atoi(argv[2]); + else if (argc == 5) { + dims.m = std::atoi(argv[2]); + dims.n = std::atoi(argv[3]); + dims.k = std::atoi(argv[4]); + } else + bad_args(); + + if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args(); + } + + run(engine_kind, dt::f32, dims, 2.0); + run(engine_kind, dt::f16, dims, 2.0); + run(engine_kind, dt::bf16, dims, 2.0); + run(engine_kind, dt::s8, dims, 2.0); +} + +int main(int argc, char **argv) { + return handle_example_errors( + matmul_perf, parse_engine_kind(argc, argv, 3), argc, argv); +}