From 67845b1ac7cd79954911dc6a235eadf4f7c47502 Mon Sep 17 00:00:00 2001 From: Hannes T Date: Mon, 20 Jun 2022 17:57:39 +0200 Subject: [PATCH 1/4] add raw pattern generator To allow more generic user-driven configuration of CPU patterns the new `raw_pattern()` accepts a vector of bool which CPUs should be enabled or disabled. This can be considered a wrapper to transform a list of bools to the set-based internal representation to be more in line with the existing pattern generators. --- .../patterns/pattern_generator.hpp | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/roco2/experiments/patterns/pattern_generator.hpp b/include/roco2/experiments/patterns/pattern_generator.hpp index 877fcc1..391ea40 100644 --- a/include/roco2/experiments/patterns/pattern_generator.hpp +++ b/include/roco2/experiments/patterns/pattern_generator.hpp @@ -152,6 +152,16 @@ namespace experiments return result; } + + // example for 24 CPUs with stride_pattern(1, 8) + // [#.......#.......#] + // [##......##......##] + // [###.....###.....###] + // [####....####....####] + // [#####...#####...#####] + // [######..######..######] + // [#######.#######.#######] + // [########################] inline pattern stride_pattern(std::size_t block_size, std::size_t stride_size) { if (omp_get_num_threads() % block_size != 0) @@ -197,6 +207,33 @@ namespace experiments return result; } + + /** + * enqueues a single pattern as provided by the on_list + */ + inline pattern raw_pattern(std::vector<bool> on_list) + { + if (static_cast<std::size_t>(omp_get_num_threads()) != on_list.size()) + { + roco2::log::warn() << "requested number of CPUs (" << on_list.size() << ") " + << "does NOT match number of available threads (" + << omp_get_num_threads() + << ")"; + } + + + cpu_sets::cpu_set cpu_set_range; + + for (std::size_t num_cpu = 0; num_cpu < on_list.size(); num_cpu++) { + if (on_list[num_cpu]) { + cpu_set_range.add(num_cpu); + } + } + + pattern result; + result.append(cpu_set_range); + return result; 
+ } } } } From 2e43b0f8f6b84a5d345d34534db415ab18952c21 Mon Sep 17 00:00:00 2001 From: Hannes T Date: Mon, 20 Jun 2022 18:01:11 +0200 Subject: [PATCH 2/4] add configuration for kallisto The configuration for the test system "kallisto" uses an alderlake processor, i.e. there are different types of cores. This newly added test aims to explore the behavior of these cores. It is not finished and in its current state mainly handles the pattern creation, i.e. the mapping of function to CPU number. --- src/configurations/CMakeLists.txt | 1 + src/configurations/kallisto/CMakeLists.txt | 4 + src/configurations/kallisto/experiment.cpp | 225 +++++++++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 src/configurations/kallisto/CMakeLists.txt create mode 100644 src/configurations/kallisto/experiment.cpp diff --git a/src/configurations/CMakeLists.txt b/src/configurations/CMakeLists.txt index 8f3dd62..afc0401 100644 --- a/src/configurations/CMakeLists.txt +++ b/src/configurations/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(example) add_subdirectory(conway) add_subdirectory(p9_highlow) add_subdirectory(p9_longrun) +add_subdirectory(kallisto) diff --git a/src/configurations/kallisto/CMakeLists.txt b/src/configurations/kallisto/CMakeLists.txt new file mode 100644 index 0000000..2cd8835 --- /dev/null +++ b/src/configurations/kallisto/CMakeLists.txt @@ -0,0 +1,4 @@ +if(CpuFreq_FOUND) + Roco2Configuration(kallisto) +endif() + diff --git a/src/configurations/kallisto/experiment.cpp b/src/configurations/kallisto/experiment.cpp new file mode 100644 index 0000000..a2b8cc8 --- /dev/null +++ b/src/configurations/kallisto/experiment.cpp @@ -0,0 +1,225 @@ +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +using namespace 
roco2::experiments::patterns; + +enum class core_type { + non_ht, + ht, + ecore, +}; + +/** + * returns number of linux CPU for number of core type + * e.g. + * + * | non_ht | 0 | 0 | + * | non_ht | 1 | 2 | + * | ht | 1 | 3 | + * | ht | 2 | 5 | + * | ecore | 0 | 16 | + * | ecore | 1 | 17 | + */ +static int get_linux_cpu_num(core_type cpu_type, int cpu_num) { + assert(cpu_num >= 0 && cpu_num <= 7); + + switch(cpu_type) { + case core_type::non_ht: + return cpu_num * 2; + case core_type::ht: + return 1 + cpu_num * 2; + case core_type::ecore: + return 16 + cpu_num; + default: + throw std::runtime_error("unknown CPU type"); + }; +} + +/// retrieve CPU patterns for alderlake +static pattern get_alderlake_patterns() { + // this function generates the patterns for alderlake + // due to the mapping of CPU num -> their functionality the traditional roco2 pattern generators are not applicable + + // the CPUs are mapped as follows: + // 0-15 P-cores + // 16-23 E-cores + // for P-cores where **even** numbers are the cores, and the following **odd** number is the corresponding HT cpu + // + // $ lscpu --all --extended + // CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ + // 0 0 0 0 0:0:0:0 yes 3201.0000 800.0000 800.000 + // 1 0 0 0 0:0:0:0 yes 3201.0000 800.0000 3201.000 + // 2 0 0 1 4:4:1:0 yes 3201.0000 800.0000 800.000 + // 3 0 0 1 4:4:1:0 yes 3201.0000 800.0000 3201.000 + // 4 0 0 2 8:8:2:0 yes 3201.0000 800.0000 800.000 + // ... 
+ // + // this is handled by the method get_linux_cpu_num() above + + pattern all_patterns; + + // Part A: non-HT, HT, E-cores separate + for (core_type cpu_type : {core_type::non_ht, core_type::ht, core_type::ecore}) { + std::vector<bool> on_list(24); + // build triangle + for (int max_enabled_cpu = 0; max_enabled_cpu < 8; max_enabled_cpu++) { + // note: CPUs from previous iteration are still enabled, no need to re-build triangle every iteration + on_list[get_linux_cpu_num(cpu_type, max_enabled_cpu)] = true; + + all_patterns = all_patterns >> raw_pattern(on_list); + } + } + + // Part B: enabling non-HT, HT, E-cores subsequently (in that order) + { + std::vector<bool> on_list(24); + for (core_type cpu_type : {core_type::non_ht, core_type::ht, core_type::ecore}) { + for (int max_enabled_cpu = 0; max_enabled_cpu < 8; max_enabled_cpu++) { + on_list[get_linux_cpu_num(cpu_type, max_enabled_cpu)] = true; + all_patterns = all_patterns >> raw_pattern(on_list); + } + } + } + + // Part C: enable first non-HT, HT, E-core, then second non-HT, HT, E-core... 
+ { + std::vector<bool> on_list(24); + for (int max_enabled_cpu = 0; max_enabled_cpu < 8; max_enabled_cpu++) { + on_list[get_linux_cpu_num(core_type::ht, max_enabled_cpu)] = true; + on_list[get_linux_cpu_num(core_type::non_ht, max_enabled_cpu)] = true; + on_list[get_linux_cpu_num(core_type::ecore, max_enabled_cpu)] = true; + + all_patterns = all_patterns >> raw_pattern(on_list); + } + } + + return all_patterns; +} + +void run_experiments(roco2::chrono::time_point starting_point, bool eta_only) +{ + roco2::kernels::busy_wait bw; + roco2::kernels::compute cp; + roco2::kernels::sinus sinus; + roco2::kernels::memory_read<> mem_rd; + roco2::kernels::memory_copy<> mem_cpy; + roco2::kernels::memory_write<> mem_wrt; + roco2::kernels::sqrt squareroot; + roco2::kernels::matmul mm; + roco2::kernels::idle idle; + roco2::kernels::mulpd mulpd; + roco2::kernels::addpd addpd; + + roco2::cpu::frequency freqctl; + + roco2::memory::numa_bind_local nbl; + + // ------ EDIT GENERIC SETTINGS BELOW THIS LINE ------ + + auto experiment_duration = std::chrono::seconds(10); + + // copied from: /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies + // note that both P- and E-cores report **the same** available frequencies + auto freq_list = std::vector{ 3201, 3200, 2500, 1700, 800 }; + + //auto on_list = get_alderlake_patterns(); + auto on_list = {roco2::experiments::cpu_sets::all_cpus()}; + + // ------ EDIT GENERIC SETTINGS ABOVE THIS LINE ------ + + roco2::task::task_plan plan; + +#pragma omp master + { + //roco2::log::info() << "Experiment list has " << on_list.size() << " entries: " << on_list; + } + +#pragma omp barrier + + auto experiment_startpoint = + roco2::initialize::thread(starting_point, experiment_duration, eta_only); + + roco2::experiments::const_lenght exp(experiment_startpoint, experiment_duration); + + auto experiment = [&](auto& kernel, const auto& on) { + plan.push_back(roco2::task::experiment_task(exp, kernel, on)); + }; + + auto setting = [&](auto lambda) { 
plan.push_back(roco2::task::make_lambda_task(lambda)); }; + + // ------ EDIT TASK PLAN BELOW THIS LINE ------ + + setting([&freqctl, &freq_list]() { freqctl.change(freq_list[0]); }); + + // for each frequency + for (const auto& freq : freq_list) + { + setting([&freqctl, freq]() { freqctl.change(freq); }); + + // for reference + experiment(idle, roco2::experiments::cpu_sets::all_cpus()); + + for (const auto& on : on_list) + { + experiment(bw, on); + experiment(cp, on); + // experiment(sinus, on); + experiment(mem_rd, on); + experiment(mem_cpy, on); + experiment(mem_wrt, on); + experiment(addpd, on); + experiment(mulpd, on); + experiment(squareroot, on); + experiment(mm, on); + } + } + + // ------ EDIT TASK PLAN ABOVE THIS LINE ------ + +#pragma omp master + { + roco2::log::info() << "ETA for whole execution: " + << std::chrono::duration_cast(plan.eta()); + } + + if (!eta_only) + { +#pragma omp barrier + + plan.execute(); + } +} From 3dd7d84f6282ebfdf4346cde89fe6f85c8eef779 Mon Sep 17 00:00:00 2001 From: Hannes T Date: Mon, 20 Jun 2022 18:30:16 +0200 Subject: [PATCH 3/4] clean up kallisto configuration Add documentation and a run script for kallisto configuration. Additionally, remove useless comments and switch to production cpu patterns/frequencies. 
--- src/configurations/kallisto/CMakeLists.txt | 5 +++- src/configurations/kallisto/README.md | 30 ++++++++++++++++++++++ src/configurations/kallisto/experiment.cpp | 8 ++---- src/configurations/kallisto/run.sh.in | 23 +++++++++++++++++ 4 files changed, 59 insertions(+), 7 deletions(-) create mode 100644 src/configurations/kallisto/README.md create mode 100644 src/configurations/kallisto/run.sh.in diff --git a/src/configurations/kallisto/CMakeLists.txt b/src/configurations/kallisto/CMakeLists.txt index 2cd8835..b72d516 100644 --- a/src/configurations/kallisto/CMakeLists.txt +++ b/src/configurations/kallisto/CMakeLists.txt @@ -1,4 +1,7 @@ -if(CpuFreq_FOUND) +if(CpuFreq_FOUND AND USE_ASM_KERNELS) Roco2Configuration(kallisto) + + configure_file("${CMAKE_CURRENT_LIST_DIR}/run.sh.in" "${CMAKE_CURRENT_BINARY_DIR}/run.sh" ESCAPE_QUOTES) endif() + diff --git a/src/configurations/kallisto/README.md b/src/configurations/kallisto/README.md new file mode 100644 index 0000000..157ffe2 --- /dev/null +++ b/src/configurations/kallisto/README.md @@ -0,0 +1,30 @@ +# Kallisto +This configuration executes workloads on the different types of cores of the test alderlake CPU **separately**. + +The following CPU "triangles" are tested: + +1. all non-HT, then all HT, then all E-cores (each triangle separate from each other) +2. enabling non-HT, HT, E-cores subsequently (in that order, i.e. "one big triangle") +3. enable first non-HT, HT, E-core (all simultaneously), then second non-HT, HT, E-core... + +The CPU mapping is non-standard, please refer to the comments in the source file for details. + +## Requirements +- Score-P +- cpufreq + +## Building + +```bash +SCOREP_WRAPPER_INSTRUMENTER_FLAGS='--user --openmp --thread=omp --nocompiler' SCOREP_WRAPPER=off cmake .. 
-DCMAKE_C_COMPILER=scorep-gcc -DCMAKE_CXX_COMPILER=scorep-g++ -DUSE_SCOREP=ON -DUSE_FIRESTARTER=OFF +make SCOREP_WRAPPER_INSTRUMENTER_FLAGS='--user --openmp --thread=omp --nocompiler' +``` + +## Notes +- The tested frequencies are `3201, 3200, 2500, 1700, 800` MHz. +- According to Linux E- and P- cores support the same frequencies. +- The nominal frequency of the E-cores 2.4 GHz is not available to be selected. + +The generated `run.sh` prepares the environment. + +You may use the provided `Utility` metric to compare the performance between different runs. diff --git a/src/configurations/kallisto/experiment.cpp b/src/configurations/kallisto/experiment.cpp index a2b8cc8..45378de 100644 --- a/src/configurations/kallisto/experiment.cpp +++ b/src/configurations/kallisto/experiment.cpp @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -134,7 +133,6 @@ void run_experiments(roco2::chrono::time_point starting_point, bool eta_only) { roco2::kernels::busy_wait bw; roco2::kernels::compute cp; - roco2::kernels::sinus sinus; roco2::kernels::memory_read<> mem_rd; roco2::kernels::memory_copy<> mem_cpy; roco2::kernels::memory_write<> mem_wrt; @@ -156,8 +154,7 @@ void run_experiments(roco2::chrono::time_point starting_point, bool eta_only) // note that both P- and E-cores report **the same** available frequencies auto freq_list = std::vector{ 3201, 3200, 2500, 1700, 800 }; - //auto on_list = get_alderlake_patterns(); - auto on_list = {roco2::experiments::cpu_sets::all_cpus()}; + auto on_list = get_alderlake_patterns(); // ------ EDIT GENERIC SETTINGS ABOVE THIS LINE ------ @@ -165,7 +162,7 @@ void run_experiments(roco2::chrono::time_point starting_point, bool eta_only) #pragma omp master { - //roco2::log::info() << "Experiment list has " << on_list.size() << " entries: " << on_list; + roco2::log::info() << "Experiment list has " << on_list.size() << " entries: " << on_list; } #pragma omp barrier @@ -197,7 +194,6 @@ void 
run_experiments(roco2::chrono::time_point starting_point, bool eta_only) { experiment(bw, on); experiment(cp, on); - // experiment(sinus, on); experiment(mem_rd, on); experiment(mem_cpy, on); experiment(mem_wrt, on); diff --git a/src/configurations/kallisto/run.sh.in b/src/configurations/kallisto/run.sh.in new file mode 100644 index 0000000..19d617e --- /dev/null +++ b/src/configurations/kallisto/run.sh.in @@ -0,0 +1,23 @@ +#!/bin/sh + +module purge --force +module load toolchain/system scorep scorep_metricq + +echo "it is $(date)" + +export GOMP_CPU_AFFINITY=0-23 + +export SCOREP_ENABLE_TRACING=1 +export SCOREP_ENABLE_PROFILING=0 +export SCOREP_TOTAL_MEMORY=4095M + +echo "environment variables:" +echo "GOMP_CPU_AFFINITY = $GOMP_CPU_AFFINITY" +echo "SCOREP_ENABLE_TRACING = $SCOREP_ENABLE_TRACING" +echo "SCOREP_ENABLE_PROFILING = $SCOREP_ENABLE_PROFILING" +echo "SCOREP_TOTAL_MEMORY = $SCOREP_TOTAL_MEMORY" +echo "SCOREP_METRIC_PLUGINS = $SCOREP_METRIC_PLUGINS" + +echo "executing test..." +${CMAKE_CURRENT_BINARY_DIR}/roco2_kallisto +echo "done" From 284d574b8e54deffcbf4f12837500e7a34298fea Mon Sep 17 00:00:00 2001 From: Hannes T Date: Fri, 24 Jun 2022 13:03:54 +0200 Subject: [PATCH 4/4] kallisto: disable OpenBLAS-owned threading The following *only* applies to OpenBLAS. The BLAS invocation during the matmul kernel uses BLAS-owned threads. These are created separately, and thus are *not pinned* correctly. In our kallisto (alder lake) test system this led to *100%* CPU usage, while only 1/3rd of threads were active (E-cores only). The fix is documented in the OpenBLAS FAQ under "How can I use OpenBLAS in multi-threaded applications?", recommending setting an environment variable to disable this separate thread spawning. 
--- src/configurations/kallisto/README.md | 5 +++-- src/configurations/kallisto/run.sh.in | 31 ++++++++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/configurations/kallisto/README.md b/src/configurations/kallisto/README.md index 157ffe2..40f7e34 100644 --- a/src/configurations/kallisto/README.md +++ b/src/configurations/kallisto/README.md @@ -16,14 +16,15 @@ The CPU mapping is non-standard, please refer to the comments in the source file ## Building ```bash -SCOREP_WRAPPER_INSTRUMENTER_FLAGS='--user --openmp --thread=omp --nocompiler' SCOREP_WRAPPER=off cmake .. -DCMAKE_C_COMPILER=scorep-gcc -DCMAKE_CXX_COMPILER=scorep-g++ -DUSE_SCOREP=ON -DUSE_FIRESTARTER=OFF -make SCOREP_WRAPPER_INSTRUMENTER_FLAGS='--user --openmp --thread=omp --nocompiler' +cmake .. -DUSE_SCOREP=OFF -DUSE_FIRESTARTER=OFF -DUSE_ASM_KERNELS=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo +make ``` ## Notes - The tested frequencies are `3201, 3200, 2500, 1700, 800` MHz. - According to Linux E- and P- cores support the same frequencies. - The nominal frequency of the E-cores 2.4 GHz is not available to be selected. +- OpenBLAS-owned threading is disabled via an environment variable. The generated `run.sh` prepares the environment. 
diff --git a/src/configurations/kallisto/run.sh.in b/src/configurations/kallisto/run.sh.in index 19d617e..bd740fc 100644 --- a/src/configurations/kallisto/run.sh.in +++ b/src/configurations/kallisto/run.sh.in @@ -1,23 +1,34 @@ #!/bin/sh +. /etc/profile.d/lmod.sh +. /etc/profile.d/zih-a-lmod.sh + module purge --force -module load toolchain/system scorep scorep_metricq +module load toolchain/system scorep scorep_metricq lo2s echo "it is $(date)" +# openMP thread pinning export GOMP_CPU_AFFINITY=0-23 -export SCOREP_ENABLE_TRACING=1 -export SCOREP_ENABLE_PROFILING=0 -export SCOREP_TOTAL_MEMORY=4095M +# disable threading by openblas itself +export OPENBLAS_NUM_THREADS=1 + +# overwrite metricq timeout +export SCOREP_METRIC_METRICQ_PLUGIN_TIMEOUT=12h echo "environment variables:" -echo "GOMP_CPU_AFFINITY = $GOMP_CPU_AFFINITY" -echo "SCOREP_ENABLE_TRACING = $SCOREP_ENABLE_TRACING" -echo "SCOREP_ENABLE_PROFILING = $SCOREP_ENABLE_PROFILING" -echo "SCOREP_TOTAL_MEMORY = $SCOREP_TOTAL_MEMORY" -echo "SCOREP_METRIC_PLUGINS = $SCOREP_METRIC_PLUGINS" +echo " GOMP_CPU_AFFINITY = $GOMP_CPU_AFFINITY" +echo " SCOREP_METRIC_PLUGINS = $SCOREP_METRIC_PLUGINS" +echo " SCOREP_METRIC_METRICQ_PLUGIN_TIMEOUT = $SCOREP_METRIC_METRICQ_PLUGIN_TIMEOUT" echo "executing test..." -${CMAKE_CURRENT_BINARY_DIR}/roco2_kallisto +ulimit -n 999999 +elab frequency turbo + +perf probe -d roco2:metrics +sudo perf probe -x ${CMAKE_CURRENT_BINARY_DIR}/roco2_kallisto roco2:metrics=_ZN5roco27metrics4meta5writeEmmlmmmm experiment frequency shell threads utility || exit 1 + +lo2s -X -t roco2:metrics -- ${CMAKE_CURRENT_BINARY_DIR}/roco2_kallisto + echo "done"