Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alderlake expansion #10

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions include/roco2/experiments/patterns/pattern_generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,16 @@ namespace experiments
return result;
}


// example for 24 CPUs with stride_pattern(1, 8)
// [#.......#.......#]
// [##......##......##]
// [###.....###.....###]
// [####....####....####]
// [#####...#####...#####]
// [######..######..######]
// [#######.#######.#######]
// [########################]
inline pattern stride_pattern(std::size_t block_size, std::size_t stride_size)
{
if (omp_get_num_threads() % block_size != 0)
Expand Down Expand Up @@ -197,6 +207,33 @@ namespace experiments

return result;
}

/**
 * Builds a pattern that consists of exactly one CPU set, described by on_list.
 *
 * @param on_list one flag per CPU; CPU number i is added to the set if and
 *                only if on_list[i] is true
 * @return a pattern to which the resulting CPU set has been appended
 *
 * If on_list.size() differs from omp_get_num_threads() a warning is logged;
 * the pattern is still built from on_list as given.
 */
inline pattern raw_pattern(std::vector<bool> on_list)
{
    const auto available_threads = static_cast<std::size_t>(omp_get_num_threads());

    if (available_threads != on_list.size())
    {
        // note the trailing space after ")" -- without it the message reads "(24)does NOT ..."
        roco2::log::warn() << "requested number of CPUs (" << on_list.size() << ") "
                           << "does NOT match number of available threads ("
                           << available_threads << ")";
    }

    cpu_sets::cpu_set cpu_set_range;

    for (std::size_t num_cpu = 0; num_cpu < on_list.size(); num_cpu++)
    {
        if (on_list[num_cpu])
        {
            cpu_set_range.add(num_cpu);
        }
    }

    pattern result;
    result.append(cpu_set_range);
    return result;
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/configurations/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ add_subdirectory(example)
add_subdirectory(conway)
add_subdirectory(p9_highlow)
add_subdirectory(p9_longrun)
add_subdirectory(kallisto)
7 changes: 7 additions & 0 deletions src/configurations/kallisto/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# The kallisto configuration is only set up when CpuFreq was found and the
# assembler kernels are enabled (USE_ASM_KERNELS).
if(CpuFreq_FOUND AND USE_ASM_KERNELS)
Roco2Configuration(kallisto)

# Generate run.sh in the build directory from the run.sh.in template next to
# this file; quotes inside substituted values are escaped (ESCAPE_QUOTES).
configure_file("${CMAKE_CURRENT_LIST_DIR}/run.sh.in" "${CMAKE_CURRENT_BINARY_DIR}/run.sh" ESCAPE_QUOTES)
endif()


31 changes: 31 additions & 0 deletions src/configurations/kallisto/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Kallisto
This configuration executes workloads on the different core types of the Alder Lake test CPU **separately**.

The following CPU "triangles" are tested:

1. all non-HT, then all HT, then all E-cores (each triangle separate from each other)
2. enabling non-HT, HT, E-cores subsequently (in that order, i.e. "one big triangle")
3. enable first non-HT, HT, E-core (all simultaneously), then second non-HT, HT, E-core...

The CPU mapping is non-standard, please refer to the comments in the source file for details.

## Requirements
- Score-P
- cpufreq

## Building

```bash
cmake .. -DUSE_SCOREP=OFF -DUSE_FIRESTARTER=OFF -DUSE_ASM_KERNELS=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo
make
```

## Notes
- The tested frequencies are `3201, 3200, 2500, 1700, 800` MHz.
- According to Linux, E-cores and P-cores support the same frequencies.
- The nominal frequency of the E-cores (2.4 GHz) is not available for selection.
- OpenBLAS-owned threading is disabled via an environment variable.

The generated `run.sh` prepares the environment.

You may use the provided `Utility` metric to compare the performance between different runs.
221 changes: 221 additions & 0 deletions src/configurations/kallisto/experiment.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
#include <roco2/initialize.hpp>

#include <roco2/cpu/c_state_limit.hpp>
#include <roco2/cpu/ddcm.hpp>
#include <roco2/cpu/frequency.hpp>
#include <roco2/cpu/shell.hpp>
#include <roco2/cpu/topology.hpp>

#include <roco2/memory/numa.hpp>

#include <roco2/chrono/chrono.hpp>
#include <roco2/log.hpp>

#include <roco2/experiments/const_length.hpp>
#include <roco2/experiments/patterns/pattern_generator.hpp>

#include <roco2/kernels/addpd.hpp>
#include <roco2/kernels/busy_wait.hpp>
#include <roco2/kernels/compute.hpp>
#include <roco2/kernels/high_low.hpp>
#include <roco2/kernels/idle.hpp>
#include <roco2/kernels/matmul.hpp>
#include <roco2/kernels/memory_copy.hpp>
#include <roco2/kernels/memory_read.hpp>
#include <roco2/kernels/memory_write.hpp>
#include <roco2/kernels/mulpd.hpp>
#include <roco2/kernels/sqrt.hpp>

#include <roco2/task/experiment_task.hpp>
#include <roco2/task/lambda_task.hpp>
#include <roco2/task/task_plan.hpp>

#include <cassert>
#include <stdexcept>
#include <string>
#include <vector>

using namespace roco2::experiments::patterns;

/// Kinds of logical CPUs that get_linux_cpu_num() can address.
enum class core_type {
    non_ht, ///< mapped to the even Linux CPU numbers 0, 2, ..., 14
    ht,     ///< mapped to the odd Linux CPU numbers 1, 3, ..., 15
    ecore,  ///< mapped to the Linux CPU numbers 16, 17, ..., 23
};

/**
 * Translates "the cpu_num-th CPU of type cpu_type" into the Linux CPU number.
 *
 * Examples:
 *
 * | type   | cpu_num | Linux CPU |
 * | non_ht |       0 |         0 |
 * | non_ht |       1 |         2 |
 * | ht     |       1 |         3 |
 * | ht     |       2 |         5 |
 * | ecore  |       0 |        16 |
 * | ecore  |       1 |        17 |
 *
 * @param cpu_type kind of CPU to address
 * @param cpu_num  zero-based index within that kind, must be in [0, 7]
 * @return the Linux CPU number
 * @throws std::out_of_range  if cpu_num is outside [0, 7]
 * @throws std::runtime_error if cpu_type is not a known enumerator
 */
static int get_linux_cpu_num(core_type cpu_type, int cpu_num) {
    // Checked at runtime instead of via assert(): release-style builds define
    // NDEBUG, which would silently drop the check and let callers index past
    // the 24-entry CPU lists.
    if (cpu_num < 0 || cpu_num > 7) {
        throw std::out_of_range("CPU index per core type must be in [0, 7]");
    }

    // no default label, so the compiler can warn about unhandled enumerators
    switch (cpu_type) {
    case core_type::non_ht:
        return cpu_num * 2;
    case core_type::ht:
        return 1 + cpu_num * 2;
    case core_type::ecore:
        return 16 + cpu_num;
    }

    throw std::runtime_error("unknown CPU type");
}

/// Assembles every CPU pattern that is measured on the Alder Lake system.
static pattern get_alderlake_patterns() {
    // The regular roco2 pattern generators cannot be used here, because the
    // Linux CPU numbering of this machine interleaves cores and hyper-threads:
    //   0-15   P-cores (an **even** number is a core, the following **odd**
    //          number is its HT sibling)
    //   16-23  E-cores
    //
    // $ lscpu --all --extended
    // CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ
    // 0 0 0 0 0:0:0:0 yes 3201.0000 800.0000 800.000
    // 1 0 0 0 0:0:0:0 yes 3201.0000 800.0000 3201.000
    // 2 0 0 1 4:4:1:0 yes 3201.0000 800.0000 800.000
    // 3 0 0 1 4:4:1:0 yes 3201.0000 800.0000 3201.000
    // 4 0 0 2 8:8:2:0 yes 3201.0000 800.0000 800.000
    // ...
    //
    // get_linux_cpu_num() encapsulates that mapping.

    constexpr int total_cpus = 24;
    constexpr int cpus_per_type = 8;
    const core_type type_order[] = { core_type::non_ht, core_type::ht, core_type::ecore };

    pattern collected;

    // appends the current state of the given on/off mask as one more pattern
    const auto record = [&collected](const std::vector<bool>& enabled) {
        collected = collected >> raw_pattern(enabled);
    };

    // Part A: one separate triangle each for non-HT, HT and E-cores
    for (const core_type type : type_order) {
        std::vector<bool> enabled(total_cpus);
        for (int idx = 0; idx < cpus_per_type; ++idx) {
            // CPUs switched on in earlier iterations stay on, so the triangle
            // grows by one CPU per step without being rebuilt
            enabled[get_linux_cpu_num(type, idx)] = true;
            record(enabled);
        }
    }

    // Part B: one big triangle -- non-HT first, then HT, then E-cores
    {
        std::vector<bool> enabled(total_cpus);
        for (const core_type type : type_order) {
            for (int idx = 0; idx < cpus_per_type; ++idx) {
                enabled[get_linux_cpu_num(type, idx)] = true;
                record(enabled);
            }
        }
    }

    // Part C: per step switch on the n-th non-HT, HT and E-core simultaneously
    {
        std::vector<bool> enabled(total_cpus);
        for (int idx = 0; idx < cpus_per_type; ++idx) {
            for (const core_type type : type_order) {
                enabled[get_linux_cpu_num(type, idx)] = true;
            }
            record(enabled);
        }
    }

    return collected;
}

/**
 * Builds the task plan for the kallisto configuration and, unless only the
 * ETA was requested, executes it.
 *
 * For every entry of freq_list the plan contains: a frequency change, one
 * idle run on all CPUs, and then every kernel on every CPU pattern returned
 * by get_alderlake_patterns().
 *
 * NOTE(review): the orphaned `omp master` / `omp barrier` pragmas suggest this
 * function is entered by every thread of an enclosing OpenMP parallel region
 * -- confirm against the caller.
 *
 * @param starting_point forwarded to roco2::initialize::thread()
 * @param eta_only       forwarded to roco2::initialize::thread(); if true, the
 *                       plan is built and its ETA is logged, but it is not executed
 */
void run_experiments(roco2::chrono::time_point starting_point, bool eta_only)
{
// one instance of every kernel that is scheduled in the task plan below
roco2::kernels::busy_wait bw;
roco2::kernels::compute cp;
roco2::kernels::memory_read<> mem_rd;
roco2::kernels::memory_copy<> mem_cpy;
roco2::kernels::memory_write<> mem_wrt;
roco2::kernels::sqrt squareroot;
roco2::kernels::matmul mm;
roco2::kernels::idle idle;
roco2::kernels::mulpd mulpd;
roco2::kernels::addpd addpd;

// used by the frequency-changing tasks queued below
roco2::cpu::frequency freqctl;

// not referenced again in this function; presumably it takes effect through
// its construction/destruction -- TODO confirm
roco2::memory::numa_bind_local nbl;

// ------ EDIT GENERIC SETTINGS BELOW THIS LINE ------

// length handed to roco2::initialize::thread() and to the experiment object
auto experiment_duration = std::chrono::seconds(10);

// copied from: /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
// note that both P- and E-cores report **the same** available frequencies
auto freq_list = std::vector<unsigned>{ 3201, 3200, 2500, 1700, 800 };

// all CPU patterns (parts A, B and C) for this machine
auto on_list = get_alderlake_patterns();

// ------ EDIT GENERIC SETTINGS ABOVE THIS LINE ------

roco2::task::task_plan plan;

// log the pattern list once instead of once per thread
#pragma omp master
{
roco2::log::info() << "Experiment list has " << on_list.size() << " entries: " << on_list;
}

#pragma omp barrier

auto experiment_startpoint =
roco2::initialize::thread(starting_point, experiment_duration, eta_only);

roco2::experiments::const_lenght exp(experiment_startpoint, experiment_duration);

// helper: queue one experiment task running `kernel` with the CPU set `on`
auto experiment = [&](auto& kernel, const auto& on) {
plan.push_back(roco2::task::experiment_task(exp, kernel, on));
};

// helper: queue an arbitrary callable as a task (used for frequency changes)
auto setting = [&](auto lambda) { plan.push_back(roco2::task::make_lambda_task(lambda)); };

// ------ EDIT TASK PLAN BELOW THIS LINE ------

// queue an initial switch to the first frequency of the list; the first loop
// iteration below queues a change to the same value again
setting([&freqctl, &freq_list]() { freqctl.change(freq_list[0]); });

// for each frequency
for (const auto& freq : freq_list)
{
// freq is captured by value, because the task is invoked after this loop
// iteration has ended
setting([&freqctl, freq]() { freqctl.change(freq); });

// for reference
experiment(idle, roco2::experiments::cpu_sets::all_cpus());

// every kernel on every CPU pattern
for (const auto& on : on_list)
{
experiment(bw, on);
experiment(cp, on);
experiment(mem_rd, on);
experiment(mem_cpy, on);
experiment(mem_wrt, on);
experiment(addpd, on);
experiment(mulpd, on);
experiment(squareroot, on);
experiment(mm, on);
}
}

// ------ EDIT TASK PLAN ABOVE THIS LINE ------

// log the estimated total runtime once instead of once per thread
#pragma omp master
{
roco2::log::info() << "ETA for whole execution: "
<< std::chrono::duration_cast<std::chrono::seconds>(plan.eta());
}

// in ETA-only mode the plan is never executed
if (!eta_only)
{
#pragma omp barrier

plan.execute();
}
}
34 changes: 34 additions & 0 deletions src/configurations/kallisto/run.sh.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Prepares the environment on the test system and runs the roco2 kallisto
# configuration under lo2s.
#
# This file is a CMake configure_file() template: references written in the
# dollar-brace form are replaced by CMake at configure time, which is why all
# shell variables below are written in the plain $NAME form.
#
# bash (not plain sh) is required because of the "source" builtin used below.

source /etc/profile.d/lmod.sh
source /etc/profile.d/zih-a-lmod.sh

module purge --force
module load toolchain/system scorep scorep_metricq lo2s

echo "it is $(date)"

# openMP thread pinning
export GOMP_CPU_AFFINITY=0-23

# disable threading by openblas itself
export OPENBLAS_NUM_THREADS=1

# overwrite metricq timeout
export SCOREP_METRIC_METRICQ_PLUGIN_TIMEOUT=12h

echo "environment variables:"
echo " GOMP_CPU_AFFINITY = $GOMP_CPU_AFFINITY"
echo " OPENBLAS_NUM_THREADS = $OPENBLAS_NUM_THREADS"
echo " SCOREP_METRIC_PLUGINS = $SCOREP_METRIC_PLUGINS"
echo " SCOREP_METRIC_METRICQ_PLUGIN_TIMEOUT = $SCOREP_METRIC_METRICQ_PLUGIN_TIMEOUT"

echo "executing test..."
ulimit -n 999999
elab frequency turbo

# remove a probe left over from a previous run; failure (no such probe) is ignored
# NOTE(review): this runs without sudo while the probe is created with sudo -- confirm
perf probe -d roco2:metrics
# register the probe on the very binary that lo2s executes below, independent
# of the directory this script is started from
sudo perf probe -x "${CMAKE_CURRENT_BINARY_DIR}/roco2_kallisto" roco2:metrics=_ZN5roco27metrics4meta5writeEmmlmmmm experiment frequency shell threads utility || exit 1

lo2s -X -t roco2:metrics -- "${CMAKE_CURRENT_BINARY_DIR}/roco2_kallisto"

echo "done"