From 7e979f50ddaa52767c53dc042e670a45a8c39ba6 Mon Sep 17 00:00:00 2001 From: romnnn Date: Fri, 1 Sep 2023 03:24:13 +0200 Subject: [PATCH] validate: profile CUDA commands using nvprof --- Cargo.lock | 3 + Pipfile | 4 +- WIP.md | 11 +- accelsim/src/stats.rs | 12 +- benches/vectoradd.rs | 4 +- gpucachesim/__init__.py | 3 + gpucachesim/benchmarks.py | 42 ++ gpucachesim/stats/__init__.py | 27 + gpucachesim/stats/accelsim.py | 2 + gpucachesim/stats/native.py | 6 + gpucachesim/stats/stats.py | 6 + notebooks/plots.ipynb | 591 +++++++++++++++++- playground/sys/src/ref/memory_stats.hpp | 2 - profile/Cargo.toml | 2 + profile/src/lib.rs | 23 +- profile/src/main.rs | 70 ++- profile/src/nsight/metrics.rs | 5 + profile/src/nsight/mod.rs | 10 + profile/src/nvprof/metrics.rs | 44 ++ profile/src/nvprof/mod.rs | 309 ++++++--- .../nvprof_vectoradd_100_32_commands.txt | 10 + stats/Cargo.toml | 1 + stats/src/cache.rs | 5 +- stats/src/dram.rs | 120 +++- test-apps/test-apps-materialized.yml | 2 +- utils/src/lib.rs | 13 + validate/src/accelsim.rs | 12 +- validate/src/benchmark/matrix.rs | 3 - validate/src/options.rs | 7 +- validate/src/playground.rs | 9 +- validate/src/profile.rs | 34 +- validate/src/simulate.rs | 12 +- validate/src/stats.rs | 23 +- 33 files changed, 1242 insertions(+), 185 deletions(-) create mode 100644 gpucachesim/__init__.py create mode 100644 gpucachesim/benchmarks.py create mode 100644 gpucachesim/stats/__init__.py create mode 100644 gpucachesim/stats/accelsim.py create mode 100644 gpucachesim/stats/native.py create mode 100644 gpucachesim/stats/stats.py create mode 100644 profile/src/nsight/metrics.rs create mode 100644 profile/src/nsight/mod.rs create mode 100755 profile/tests/nvprof_vectoradd_100_32_commands.txt diff --git a/Cargo.lock b/Cargo.lock index e4436678..177adcc6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2828,12 +2828,14 @@ dependencies = [ "clap", "color-eyre", "csv", + "indexmap 2.0.0", "log", "once_cell", "pretty_assertions_sorted", "regex", "serde", "serde_json", + "similar-asserts", "tempfile", "thiserror", "tokio", @@ -3588,6 +3590,7 @@ dependencies = [ "indexmap 2.0.0", "serde", "strum", + "utils", ] [[package]] diff --git a/Pipfile b/Pipfile index 47eb2075..68b6a32a 100644 --- a/Pipfile +++ b/Pipfile @@ -11,10 +11,12 @@ matplotlib = "*" numpy = "*" scipy = "*" pandas = "*" -jupyterlab = "*" wasabi = "*" +click = "*" [dev-packages] invoke = "*" flake8 = "*" black = "*" +jupyterlab = "*" +mypy = "*" diff --git a/WIP.md b/WIP.md index c81f933a..ed54c181 100644 --- a/WIP.md +++ b/WIP.md @@ -1,16 +1,12 @@ #### TODO -- notes - - - l2 cache is shared in sub partitions, we get different l2 metrics - - l1i are the same always, as they are local to the core - - today: - - record mem fetch latency in playground and box - - write trait for tag array + - convert, match and plot statistics - rename crates and github repo - publish to crates.io + - record mem fetch latency in playground and box + - DONE: write trait for tag array - TODO: @@ -21,7 +17,6 @@ - use traits for common components - record mem fetch latency - add a few more stats - - plot statistics - publish python package to pip - tomorrow: diff --git a/accelsim/src/stats.rs b/accelsim/src/stats.rs index 79ab0832..efab47db 100644 --- a/accelsim/src/stats.rs +++ b/accelsim/src/stats.rs @@ -1,5 +1,6 @@ use color_eyre::eyre; use std::collections::HashMap; +use utils::box_slice; pub type Stat = (String, u16, String); pub type Map = indexmap::IndexMap; @@ -102,7 +103,7 @@ impl TryFrom for stats::Stats { .into_iter() .collect(); - let mut l2d_stats = stats::PerCache(vec![stats::Cache::default(); 1].into_boxed_slice()); + let mut l2d_stats = stats::PerCache(box_slice![stats::Cache::default(); 1]); let l2d_total = &mut l2d_stats[0]; for kind in AccessKind::iter() { for reservation_failure in ReservationFailure::iter() { @@ -160,10 +161,11 @@ impl TryFrom for stats::Stats { }, accesses: stats::Accesses(accesses), dram: stats::DRAM { - bank_writes: vec![vec![vec![total_dram_writes]]], - bank_reads: vec![vec![vec![total_dram_reads]]], - total_bank_writes: vec![vec![total_dram_writes]], - total_bank_reads: vec![vec![total_dram_reads]], + bank_writes: box_slice![box_slice![box_slice![total_dram_writes]]], + bank_reads: box_slice![box_slice![box_slice![total_dram_reads]]], + total_bank_writes: box_slice![box_slice![total_dram_writes]], + total_bank_reads: box_slice![box_slice![total_dram_reads]], + ..stats::DRAM::default() }, instructions: stats::InstructionCounts::default(), l1i_stats: stats::PerCache::new(0), diff --git a/benches/vectoradd.rs b/benches/vectoradd.rs index f1333407..32a0eb1b 100644 --- a/benches/vectoradd.rs +++ b/benches/vectoradd.rs @@ -111,10 +111,10 @@ pub fn box_benchmark(c: &mut Criterion) { } criterion::criterion_group!(benches, box_benchmark, play_benchmark, accelsim_benchmark); -// criterion::criterion_main!(benches); +criterion::criterion_main!(benches); #[allow(dead_code)] -fn main() -> eyre::Result<()> { +fn main_other() -> eyre::Result<()> { use itertools::Itertools; #[allow(unused_imports)] use std::io::Write; diff --git a/gpucachesim/__init__.py b/gpucachesim/__init__.py new file mode 100644 index 00000000..caceb2e4 --- /dev/null +++ b/gpucachesim/__init__.py @@ -0,0 +1,3 @@ +from pathlib import Path + +ROOT_DIR = Path(__file__).parent diff --git a/gpucachesim/benchmarks.py b/gpucachesim/benchmarks.py new file mode 100644 index 00000000..a58dc494 --- /dev/null +++ b/gpucachesim/benchmarks.py @@ -0,0 +1,42 @@ +import click +import yaml +from pathlib import Path +from os import PathLike +from typing import Optional + +from gpucachesim import ROOT_DIR + +REPO_ROOT_DIR = ROOT_DIR.parent +DEFAULT_BENCH_FILE = REPO_ROOT_DIR / "test-apps/test-apps-materialized.yml" + + +class Benchmarks: + def __init__(self, path: PathLike) -> None: + """load the materialized benchmark config""" + + with open(path or DEFAULT_BENCH_FILE, "rb") as f: + benchmarks = yaml.safe_load(f) + + self.benchmarks = benchmarks["benchmarks"] + + def __getitem__(self, bench_name: str): + return self.benchmarks[bench_name] + + def get_bench_config(self, bench_name: str, input_idx: int): + return self.benchmarks[bench_name][input_idx] + + +@click.command() +@click.option("--path", default=DEFAULT_BENCH_FILE, help="Path to materialized benchmark config") +def main(path): + from pprint import pprint + + print(path) + b = Benchmarks(path) + + benchmark_names = list(b.benchmarks.keys()) + pprint(benchmark_names) + + +if __name__ == "__main__": + main() diff --git a/gpucachesim/stats/__init__.py b/gpucachesim/stats/__init__.py new file mode 100644 index 00000000..c67a3682 --- /dev/null +++ b/gpucachesim/stats/__init__.py @@ -0,0 +1,27 @@ +import click + +import gpucachesim.stats.stats as stats +import gpucachesim.stats.native as native +from gpucachesim.benchmarks import Benchmarks + + +@click.command() +@click.option("--path", help="Path to materialized benchmark config") +@click.option("--bench", help="Benchmark name") +@click.option("--input", default=0, help="Input index") +def main(path, bench, input): + from pprint import pprint + + b = Benchmarks(path) + if bench is None: + raise NotImplemented + print(bench, input) + bench_config = b.get_bench_config(bench, input) + # pprint(bench_config) + + our_stats = stats.Stats(bench_config["simulate"]) + native_stats = native.Stats(bench_config["simulate"]) + + +if __name__ == "__main__": + main() diff --git a/gpucachesim/stats/accelsim.py b/gpucachesim/stats/accelsim.py new file mode 100644 index 00000000..c8fe4366 --- /dev/null +++ b/gpucachesim/stats/accelsim.py @@ -0,0 +1,2 @@ +class AccelsimStats: + pass diff --git a/gpucachesim/stats/native.py b/gpucachesim/stats/native.py new file mode 100644 index 00000000..b6300e48 --- /dev/null +++ b/gpucachesim/stats/native.py @@ -0,0 +1,6 @@ +from os import PathLike + + +class Stats: + def __init__(self, result_dir: PathLike) -> None: + self.path = result_dir diff --git a/gpucachesim/stats/stats.py b/gpucachesim/stats/stats.py new file mode 100644 index 00000000..b6300e48 --- /dev/null +++ b/gpucachesim/stats/stats.py @@ -0,0 +1,6 @@ +from os import PathLike + + +class Stats: + def __init__(self, result_dir: PathLike) -> None: + self.path = result_dir diff --git a/notebooks/plots.ipynb b/notebooks/plots.ipynb index dc2ee541..bc9eac3e 100644 --- a/notebooks/plots.ipynb +++ b/notebooks/plots.ipynb @@ -25,11 +25,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "['vectorAdd', 'simple_matrixmul']\n" + "['vectorAdd', 'simple_matrixmul', 'matrixmul', 'transpose']\n" ] } ], "source": [ + "# load the materialized benchmark config\n", "benchmark_file = \"../test-apps/test-apps-materialized.yml\"\n", "with open(benchmark_file, \"rb\") as f:\n", " benchmarks = yaml.safe_load(f)\n", @@ -46,6 +47,7 @@ "metadata": {}, "outputs": [], "source": [ + "# define targets to use\n", "targets = {\n", " \"accelsim_simulate\": \"\",\n", " \"simulate\": \"\",\n", @@ -58,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "165b6c35-95c7-458f-9f4e-3b25d21122bd", "metadata": {}, "outputs": [ @@ -68,14 +70,69 @@ "text": [ "[(0, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['100', '32']),\n", " (1, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['1000', '32']),\n", - " (2, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['10000', '32'])]\n" + " (2, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['10000', '32'])]\n", + "{'accelsim_simulate': {'concurrency': None,\n", + " 'config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.config',\n", + " 'config_dir': '/home/roman/dev/box/accelsim/gtx1080',\n", + " 'enabled': True,\n", + " 'inter_config': '/home/roman/dev/box/accelsim/gtx1080/config_fermi_islip.icnt',\n", + " 'repetitions': 2,\n", + " 'results_dir': '/home/roman/dev/box/results',\n", + " 'stats_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/accelsim-sim',\n", + " 'timeout': None,\n", + " 'trace_config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.trace.config'},\n", + " 'accelsim_trace': {'concurrency': 1,\n", + " 'enabled': True,\n", + " 'repetitions': 1,\n", + " 'results_dir': '/home/roman/dev/box/results',\n", + " 'timeout': None,\n", + " 'traces_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/accelsim-trace'},\n", + " 'args': ['100', '32'],\n", + " 'benchmark_idx': 0,\n", + " 'executable': '/home/roman/dev/box/test-apps/vectoradd/vectoradd',\n", + " 'input_idx': 0,\n", + " 'name': 'vectorAdd',\n", + " 'path': '/home/roman/dev/box/test-apps/vectoradd',\n", + " 'playground_simulate': {'concurrency': 1,\n", + " 'config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.config',\n", + " 'config_dir': '/home/roman/dev/box/accelsim/gtx1080',\n", + " 'enabled': True,\n", + " 'inter_config': '/home/roman/dev/box/accelsim/gtx1080/config_fermi_islip.icnt',\n", + " 'repetitions': 2,\n", + " 'results_dir': '/home/roman/dev/box/results',\n", + " 'stats_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/playground-sim',\n", + " 'timeout': None,\n", + " 'trace_config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.trace.config'},\n", + " 'profile': {'concurrency': 1,\n", + " 'enabled': True,\n", + " 'profile_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/profile',\n", + " 'repetitions': 5,\n", + " 'results_dir': '/home/roman/dev/box/results',\n", + " 'timeout': None},\n", + " 'simulate': {'concurrency': None,\n", + " 'enabled': True,\n", + " 'parallel': False,\n", + " 'repetitions': 2,\n", + " 'results_dir': '/home/roman/dev/box/results',\n", + " 'stats_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/sim',\n", + " 'timeout': None},\n", + " 'trace': {'concurrency': 1,\n", + " 'enabled': True,\n", + " 'full_trace': False,\n", + " 'repetitions': 1,\n", + " 'results_dir': '/home/roman/dev/box/results',\n", + " 'save_json': True,\n", + " 'timeout': None,\n", + " 'traces_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/trace'},\n", + " 'values': {'dtype': 32, 'length': 100}}\n" ] } ], "source": [ + "# check all benchmark configs for vectoradd\n", "vectoradd = benchmarks[\"vectorAdd\"]\n", "pprint([(b[\"input_idx\"], b[\"executable\"], b[\"args\"]) for b in vectoradd])\n", - "# pprint(vectoradd[0])" + "pprint(vectoradd[0])" ] }, { @@ -202,6 +259,147 @@ "sim_df" ] }, + { + "cell_type": "code", + "execution_count": 18, + "id": "82d165ef-3b69-4ca6-b845-b74afa499946", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chip_idbank_idreadswrites
00040
10140
\n", + "
" + ], + "text/plain": [ + " chip_id bank_id reads writes\n", + "0 0 0 4 0\n", + "1 0 1 4 0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dram_df = pd.read_csv(Path(benchmarks[\"vectorAdd\"][0][\"simulate\"][\"stats_dir\"]) / \"stats.dram.csv\")\n", + "dram_total = dram_df[\"reads\"] + dram_df[\"writes\"]\n", + "dram_df[dram_total > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "dd0edc19-ca39-44d8-b01f-9a771b871d35", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
core_idchip_idbank_idreadswrites
000010
100110
\n", + "
" + ], + "text/plain": [ + " core_id chip_id bank_id reads writes\n", + "0 0 0 0 1 0\n", + "1 0 0 1 1 0" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dram_banks_df = pd.read_csv(Path(benchmarks[\"vectorAdd\"][0][\"simulate\"][\"stats_dir\"]) / \"stats.dram.banks.csv\")\n", + "dram_banks_total = dram_banks_df[\"reads\"] + dram_banks_df[\"writes\"]\n", + "dram_banks_df[dram_banks_total > 0]" + ] + }, { "cell_type": "code", "execution_count": 33, @@ -523,6 +721,389 @@ "accel_stats_df" ] }, + { + "cell_type": "code", + "execution_count": 28, + "id": "23fdd9a9-ecd4-4103-becb-291c924855e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
valueunit
DeviceNVIDIA GeForce GTX 1080 (0)None
Context1.0NaN
Stream7.0NaN
Kernel_Z6vecAddIfEvPT_S1_S1_iNone
Correlation_ID1.0NaN
shared_load_transactions_per_request0.0NaN
shared_store_transactions_per_request0.0NaN
local_load_transactions_per_request0.0NaN
local_store_transactions_per_request0.0NaN
gld_transactions_per_request13.25NaN
gst_transactions_per_request3.25NaN
shared_store_transactions0.0NaN
shared_load_transactions0.0NaN
local_load_transactions0.0NaN
local_store_transactions0.0NaN
gld_transactions106.0NaN
gst_transactions13.0NaN
sysmem_read_transactions0.0NaN
sysmem_write_transactions5.0NaN
l2_read_transactions660.0NaN
l2_write_transactions26.0NaN
atomic_transactions0.0NaN
atomic_transactions_per_request0.0NaN
l2_global_load_bytes832.0NaN
l2_local_load_bytes0.0NaN
l2_surface_load_bytes0.0NaN
l2_local_global_store_bytes416.0NaN
l2_global_reduction_bytes0.0NaN
l2_global_atomic_store_bytes0.0NaN
l2_surface_store_bytes0.0NaN
l2_surface_reduction_bytes0.0NaN
l2_surface_atomic_store_bytes0.0NaN
global_load_requests26.0NaN
local_load_requests0.0NaN
surface_load_requests0.0NaN
global_store_requests13.0NaN
local_store_requests0.0NaN
surface_store_requests0.0NaN
global_atomic_requests0.0NaN
global_reduction_requests0.0NaN
surface_atomic_requests0.0NaN
surface_reduction_requests0.0NaN
dram_read_transactions72.0NaN
dram_write_transactions12.0NaN
dram_read_throughput1.151163GB/s
dram_write_throughput196.46509MB/s
dram_write_bytes384.0NaN
dram_read_bytes2304.0NaN
\n", + "
" + ], + "text/plain": [ + " value unit\n", + "Device NVIDIA GeForce GTX 1080 (0) None\n", + "Context 1.0 NaN\n", + "Stream 7.0 NaN\n", + "Kernel _Z6vecAddIfEvPT_S1_S1_i None\n", + "Correlation_ID 1.0 NaN\n", + "shared_load_transactions_per_request 0.0 NaN\n", + "shared_store_transactions_per_request 0.0 NaN\n", + "local_load_transactions_per_request 0.0 NaN\n", + "local_store_transactions_per_request 0.0 NaN\n", + "gld_transactions_per_request 13.25 NaN\n", + "gst_transactions_per_request 3.25 NaN\n", + "shared_store_transactions 0.0 NaN\n", + "shared_load_transactions 0.0 NaN\n", + "local_load_transactions 0.0 NaN\n", + "local_store_transactions 0.0 NaN\n", + "gld_transactions 106.0 NaN\n", + "gst_transactions 13.0 NaN\n", + "sysmem_read_transactions 0.0 NaN\n", + "sysmem_write_transactions 5.0 NaN\n", + "l2_read_transactions 660.0 NaN\n", + "l2_write_transactions 26.0 NaN\n", + "atomic_transactions 0.0 NaN\n", + "atomic_transactions_per_request 0.0 NaN\n", + "l2_global_load_bytes 832.0 NaN\n", + "l2_local_load_bytes 0.0 NaN\n", + "l2_surface_load_bytes 0.0 NaN\n", + "l2_local_global_store_bytes 416.0 NaN\n", + "l2_global_reduction_bytes 0.0 NaN\n", + "l2_global_atomic_store_bytes 0.0 NaN\n", + "l2_surface_store_bytes 0.0 NaN\n", + "l2_surface_reduction_bytes 0.0 NaN\n", + "l2_surface_atomic_store_bytes 0.0 NaN\n", + "global_load_requests 26.0 NaN\n", + "local_load_requests 0.0 NaN\n", + "surface_load_requests 0.0 NaN\n", + "global_store_requests 13.0 NaN\n", + "local_store_requests 0.0 NaN\n", + "surface_store_requests 0.0 NaN\n", + "global_atomic_requests 0.0 NaN\n", + "global_reduction_requests 0.0 NaN\n", + "surface_atomic_requests 0.0 NaN\n", + "surface_reduction_requests 0.0 NaN\n", + "dram_read_transactions 72.0 NaN\n", + "dram_write_transactions 12.0 NaN\n", + "dram_read_throughput 1.151163 GB/s\n", + "dram_write_throughput 196.46509 MB/s\n", + "dram_write_bytes 384.0 NaN\n", + "dram_read_bytes 2304.0 NaN" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "native_df = pd.read_json(Path(benchmarks[\"vectorAdd\"][0][\"profile\"][\"profile_dir\"]) / \"profile.metrics.json\").T\n", + "native_df" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "e109b979-c2f4-4386-9989-b95bded18124", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'hw_cycle_df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[40], line 7\u001b[0m\n\u001b[1;32m 4\u001b[0m native_commands_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame\u001b[38;5;241m.\u001b[39mfrom_dict([{k: v[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m e\u001b[38;5;241m.\u001b[39mitems()} \u001b[38;5;28;01mfor\u001b[39;00m e \u001b[38;5;129;01min\u001b[39;00m commands_json])\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# , header=None, names=[\"kernel\", \"kernel_id\", \"stat\", \"value\"])\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m native_commands_df \u001b[38;5;241m=\u001b[39m native_commands_df[\u001b[38;5;241m~\u001b[39m\u001b[43mhw_cycle_df\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorrelation_ID\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39misnull()]\n\u001b[1;32m 8\u001b[0m \u001b[38;5;66;03m# remove memcopies\u001b[39;00m\n\u001b[1;32m 9\u001b[0m native_commands_df \u001b[38;5;241m=\u001b[39m native_commands_df[\u001b[38;5;241m~\u001b[39mhw_cycle_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m[CUDA memcpy .*\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n", + "\u001b[0;31mNameError\u001b[0m: name 'hw_cycle_df' is not defined" + ] + } + ], + "source": [ + "with open(Path(benchmarks[\"vectorAdd\"][0][\"profile\"][\"profile_dir\"]) / \"profile.commands.json\", \"rb\") as f:\n", + " commands_json = json.load(f)\n", + "# print(commands_json)\n", + "native_commands_df = pd.DataFrame.from_dict([{k: v[\"value\"] for k, v in e.items()} for e in commands_json])\n", + "if True:\n", + " # , header=None, names=[\"kernel\", \"kernel_id\", \"stat\", \"value\"])\n", + " native_commands_df = native_commands_df[~native_commands_df[\"Correlation_ID\"].isnull()]\n", + " # remove memcopies\n", + " native_commands_df = native_commands_df[~native_commands_df[\"Name\"].str.contains(r\"\\[CUDA memcpy .*\\]\")]\n", + " # name refers to kernels now\n", + " native_commands_df = native_commands_df.rename(columns={\"Name\": \"Kernel\"})\n", + " # remove columns that are only relevant for memcopies\n", + " # df = df.loc[:,df.notna().any(axis=0)]\n", + " native_commands_df = native_commands_df.drop(columns=[\"Size\", \"Throughput\", \"SrcMemType\", \"DstMemType\"])\n", + " # set the correct dtypes\n", + " native_commands_df = native_commands_df.astype({\n", + " \"Start\": \"float64\",\n", + " \"Duration\": \"float64\",\n", + " \"Static SMem\": \"float64\",\n", + " \"Dynamic SMem\": \"float64\",\n", + " \"Device\": \"string\",\n", + " \"Kernel\": \"string\",\n", + " })\n", + "native_commands_df" + ] + }, { "cell_type": "code", "execution_count": 35, @@ -800,7 +1381,7 @@ " 'l2_cache_texture_read_mshr_hit',\n", " 'l2_cache_texture_read_reservation_fail',\n", " 'l2_cache_texture_read_sector_miss',\n", - "'num_dram_full_stalls',\n", + " 'num_dram_full_stalls',\n", " 'num_global_mem_read',\n", " 'num_global_mem_write',\n", " 'num_interconn_to_shared_mem_stalls',\n", diff --git a/playground/sys/src/ref/memory_stats.hpp b/playground/sys/src/ref/memory_stats.hpp index 3efb06cb..e82a0fb3 100644 --- a/playground/sys/src/ref/memory_stats.hpp +++ b/playground/sys/src/ref/memory_stats.hpp @@ -1,7 +1,5 @@ #pragma once -// #include - #include "memory_config.hpp" #include "shader_core_config.hpp" diff --git a/profile/Cargo.toml b/profile/Cargo.toml index 4e9a44e0..f5b0efbf 100644 --- a/profile/Cargo.toml +++ b/profile/Cargo.toml @@ -21,6 +21,7 @@ which = "4" csv = "1" serde = { version = "1", features = ["derive"] } serde_json = "1" +indexmap = { version = "2", features = ["serde"] } regex = "1" log = "0" tokio = { version = "1", features = ["full"] } @@ -32,3 +33,4 @@ utils = { path = "../utils" } [dev-dependencies] pretty_assertions_sorted = "1" +similar-asserts = "1" diff --git a/profile/src/lib.rs b/profile/src/lib.rs index 7f54d839..48a59b5b 100644 --- a/profile/src/lib.rs +++ b/profile/src/lib.rs @@ -1,9 +1,9 @@ #![allow(clippy::missing_panics_doc, clippy::missing_errors_doc)] -// #![allow(warnings)] +pub mod nsight; pub mod nvprof; use serde::Deserialize; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; #[derive(thiserror::Error, Debug)] pub enum ParseError { @@ -94,8 +94,19 @@ where } } -#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)] -pub struct ProfilingResult { - pub raw: String, - pub metrics: M, +#[derive(PartialEq, Clone, Debug, serde::Serialize, serde::Deserialize)] +pub enum Metrics { + /// Nvprof profiler metrics + Nvprof(nvprof::Output), + /// Nsight profiler metrics + Nsight(nsight::Output), +} + +/// Profile test application using either the nvprof or nsight compute profiler. +pub async fn nvprof(executable: impl AsRef, args: A) -> Result +where + A: Clone + IntoIterator, + ::Item: AsRef, +{ + unimplemented!() } diff --git a/profile/src/main.rs b/profile/src/main.rs index 8a093704..342aa6f5 100644 --- a/profile/src/main.rs +++ b/profile/src/main.rs @@ -1,6 +1,6 @@ use color_eyre::eyre; -use clap::{CommandFactory, Parser}; +use clap::{CommandFactory, Parser, Subcommand}; use std::path::PathBuf; const HELP_TEMPLATE: &str = "{bin} {version} {author} @@ -12,7 +12,36 @@ USAGE: {usage} {all-args} "; -const USAGE: &str = "./profile [OPTIONS] -- [args]"; +const USAGE: &str = "./profile [nvprof|nsight|auto] [OPTIONS] -- [args]"; + +/// Options for the nvprof profiler. +#[derive(Parser, Debug, Clone)] +pub struct NvprofOptions { + #[clap(long = "log-file", help = "output log file")] + pub log_file: Option, +} + +impl From for profile::nvprof::Options { + fn from(_options: NvprofOptions) -> Self { + Self {} + } +} + +/// Options for the nsight profiler. +#[derive(Parser, Debug, Clone)] +pub struct NsightOptions { + #[clap(long = "log-file", help = "output log file")] + pub log_file: Option, +} + +#[derive(Subcommand, Debug, Clone)] +pub enum Command { + Auto, + /// Profile using `nvprof` + Nvprof(NvprofOptions), + /// Profile using `nsight-compute` + Nsight(NsightOptions), +} #[derive(Parser, Debug, Clone)] #[clap( @@ -23,10 +52,11 @@ const USAGE: &str = "./profile [OPTIONS] -- [args]"; author = "romnn ", )] pub struct Options { - #[clap(long = "log-file", help = "output log file")] - pub log_file: Option, #[clap(long = "metrics-file", help = "output metrics file")] pub metrics_file: Option, + + #[clap(subcommand)] + pub command: Option, } fn parse_args() -> Result<(PathBuf, Vec, Options), clap::Error> { @@ -58,23 +88,29 @@ async fn main() -> eyre::Result<()> { let start = std::time::Instant::now(); - let (exec, exec_args, _options) = match parse_args() { + let (exec, exec_args, options) = match parse_args() { Ok(parsed) => parsed, Err(err) => err.exit(), }; - let options = profile::nvprof::Options {}; - let profile::ProfilingResult { .. } = profile::nvprof::nvprof(exec, exec_args, &options) - .await - .map_err(|err| match err { - profile::Error::Command(err) => err.into_eyre(), - other => other.into(), - })?; - - // todo: nice table view of the most important things - // todo: dump the raw output - // todo: dump the parsed output as json - // println!("{:#?}", &metrics); + let _output = match options.command { + None | Some(Command::Auto) => todo!(), + Some(Command::Nvprof(nvprof_options)) => { + let output = profile::nvprof::nvprof(exec, exec_args, &nvprof_options.into()) + .await + .map_err(|err| match err { + profile::Error::Command(err) => err.into_eyre(), + other => other.into(), + })?; + profile::Metrics::Nvprof(output) + } + Some(Command::Nsight(_nsight_options)) => todo!(), + }; + + // TODO: nice table view of the most important things + // TODO: dump the raw output + // TODO: dump the parsed output as json + // println!("{:#?}", &output.metrics); println!("profiling done in {:?}", start.elapsed()); Ok(()) } diff --git a/profile/src/nsight/metrics.rs b/profile/src/nsight/metrics.rs new file mode 100644 index 00000000..211162e2 --- /dev/null +++ b/profile/src/nsight/metrics.rs @@ -0,0 +1,5 @@ +#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)] +pub struct Metrics { + // #[serde(rename = "Device")] + // pub device: Metric, +} diff --git a/profile/src/nsight/mod.rs b/profile/src/nsight/mod.rs new file mode 100644 index 00000000..aafdb828 --- /dev/null +++ b/profile/src/nsight/mod.rs @@ -0,0 +1,10 @@ +mod metrics; + +pub use metrics::Metrics; + +#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)] +pub struct Output { + // pub raw_metrics_log: String, + // pub raw_commands_log: String, + // pub metrics: Metrics, +} diff --git a/profile/src/nvprof/metrics.rs b/profile/src/nvprof/metrics.rs index f7b1f33e..8f32aa4f 100644 --- a/profile/src/nvprof/metrics.rs +++ b/profile/src/nvprof/metrics.rs @@ -188,3 +188,47 @@ pub struct Metrics { pub dram_read_bytes: Metric, // TEMP } + +#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)] +pub struct Command { + #[serde(rename = "Start")] + pub start: Metric, + #[serde(rename = "Duration")] + pub duration: Metric, + #[serde(rename = "Grid X", default)] + pub grid_x: Metric, + #[serde(rename = "Grid Y", default)] + pub grid_y: Metric, + #[serde(rename = "Grid Z", default)] + pub grid_z: Metric, + #[serde(rename = "Block X", default)] + pub block_x: Metric, + #[serde(rename = "Block Y", default)] + pub block_y: Metric, + #[serde(rename = "Block Z", default)] + pub block_z: Metric, + #[serde(rename = "Registers Per Thread")] + pub registers_per_thread: Metric, + #[serde(rename = "Static SMem")] + pub static_shared_memory: Metric, + #[serde(rename = "Dynamic SMem")] + pub dynamic_shared_memory: Metric, + #[serde(rename = "Size")] + pub size: Metric, + #[serde(rename = "Throughput")] + pub throughput: Metric, + #[serde(rename = "SrcMemType")] + pub src_mem_type: Metric, + #[serde(rename = "DstMemType")] + pub dest_mem_type: Metric, + #[serde(rename = "Device")] + pub device: Metric, + #[serde(rename = "Context")] + pub context: Metric, + #[serde(rename = "Stream")] + pub stream: Metric, + #[serde(rename = "Name")] + pub name: Metric, + #[serde(rename = "Correlation_ID")] + pub correlation_id: Metric, +} diff --git a/profile/src/nvprof/mod.rs b/profile/src/nvprof/mod.rs index 8e5825ff..ba39abc0 100644 --- a/profile/src/nvprof/mod.rs +++ b/profile/src/nvprof/mod.rs @@ -1,16 +1,21 @@ mod metrics; -use async_process::Command; use once_cell::sync::Lazy; use regex::Regex; use std::collections::HashMap; -use std::io::{BufRead, Read, Seek}; +use std::io::{BufRead, Read}; use std::path::Path; use crate::{Error, Metric, ParseError}; -pub use metrics::Metrics; +pub use metrics::{Command, Metrics}; -pub type ProfilingResult = super::ProfilingResult; +#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)] +pub struct Output { + pub raw_metrics_log: String, + pub raw_commands_log: String, + pub metrics: Metrics, + pub commands: Vec, +} macro_rules! optional { ($x:expr) => { @@ -28,7 +33,10 @@ static NO_PERMISSION_REGEX: Lazy = static PROFILE_RESULT_REGEX: Lazy = Lazy::new(|| Regex::new(r"^==\d*==\s*Profiling result:\s*$").unwrap()); -pub fn parse_nvprof_csv(reader: &mut impl std::io::BufRead) -> Result { +pub fn seek_to_csv(reader: &mut R) -> Result, ParseError> +where + R: std::io::BufRead, +{ // seek to valid start of csv data let mut lines = reader.by_ref().lines(); for line in &mut lines { @@ -45,70 +53,63 @@ pub fn parse_nvprof_csv(reader: &mut impl std::io::BufRead) -> Result(reader: &mut impl std::io::BufRead) -> Result, ParseError> +where + M: serde::de::DeserializeOwned, +{ + let mut csv_reader = seek_to_csv(reader)?; let mut records = csv_reader.deserialize(); - let mut metrics: HashMap> = HashMap::new(); - let units: HashMap = records.next().ok_or(ParseError::MissingUnits)??; - let values: HashMap = records.next().ok_or(ParseError::MissingMetrics)??; - assert_eq!(units.len(), values.len()); + use indexmap::IndexMap; + let mut entries = Vec::new(); + let units: IndexMap = records.next().ok_or(ParseError::MissingUnits)??; - for (metric, unit) in units { - metrics.entry(metric).or_default().unit = optional!(unit); - } - for (metric, value) in values { - metrics.entry(metric).or_default().value = optional!(value); - } + while let Some(values) = records.next().transpose()? { + assert_eq!(units.len(), values.len()); + let metrics: HashMap> = units + .iter() + .zip(values.iter()) + .map(|((unit_metric, unit), (value_metric, value))| { + assert_eq!(unit_metric, value_metric); + ( + unit_metric.clone(), + Metric { + value: optional!(value).cloned(), + unit: optional!(unit).cloned(), + }, + ) + }) + .collect(); - // this is kind of hacky.. - let metrics = serde_json::to_string(&metrics)?; - let metrics: Metrics = serde_json::from_str(&metrics)?; - Ok(metrics) + // this is kind of hacky.. + let metrics = serde_json::to_string(&metrics)?; + let metrics: M = serde_json::from_str(&metrics)?; + entries.push(metrics); + } + Ok(entries) } #[derive(Debug, Clone)] pub struct Options {} -/// Profile test application using nvbprof profiler. -/// -/// Note: The nvbprof compiler is not recommended for newer devices. -/// -/// # Errors -/// - When creating temp dir fails. -/// - When profiling fails. -/// - When application fails. -#[allow(clippy::too_many_lines)] -pub async fn nvprof( +pub async fn profile_all_metrics( + nvprof: impl AsRef, executable: impl AsRef, args: A, - _options: &Options, -) -> Result + log_file_path: impl AsRef, +) -> Result<(String, Metrics), Error> where A: IntoIterator, ::Item: AsRef, { - let tmp_dir = tempfile::tempdir()?; - let log_file_path = tmp_dir.path().join("log_file.csv"); - - let nvprof = which::which("nvprof").map_err(|_| Error::MissingProfiler("nvprof".into())); - let nvprof = nvprof.or_else(|_| { - let cuda = utils::find_cuda().ok_or(Error::MissingCUDA)?; - Ok::<_, Error>(cuda.join("bin/nvprof")) - })?; - let nvprof = nvprof - .canonicalize() - .map_err(|_| Error::MissingProfiler(nvprof))?; - - let executable = executable - .as_ref() - .canonicalize() - .map_err(|_| Error::MissingExecutable(executable.as_ref().into()))?; - - let mut cmd = Command::new(nvprof); + let mut cmd = async_process::Command::new(nvprof.as_ref()); cmd.args([ "--unified-memory-profiling", "off", @@ -126,8 +127,8 @@ where "--csv", "--log-file", ]) - .arg(&log_file_path) - .arg(&executable) + .arg(log_file_path.as_ref()) + .arg(executable.as_ref()) .args(args.into_iter()); let result = cmd.output().await?; @@ -141,17 +142,114 @@ where let mut log_reader = std::io::BufReader::new(log_file); - let mut original_log = String::new(); - log_reader.read_to_string(&mut original_log)?; - log_reader.rewind()?; + let mut raw_log = String::new(); + log_reader.read_to_string(&mut raw_log)?; + + let mut log_reader = std::io::Cursor::new(&raw_log); + match parse_nvprof_csv(&mut log_reader) { + Err(source) => Err(Error::Parse { raw_log, source }), + Ok(metrics) if metrics.len() != 1 => Err(Error::Parse { + raw_log, + source: ParseError::MissingMetrics, + }), + Ok(mut metrics) => Ok((raw_log, metrics.remove(0))), + } +} + +pub async fn profile_commands( + nvprof: impl AsRef, + executable: impl AsRef, + args: A, + log_file_path: impl AsRef, +) -> Result<(String, Vec), Error> +where + A: IntoIterator, + ::Item: AsRef, +{ + let mut cmd = async_process::Command::new(nvprof.as_ref()); + cmd.args([ + "--unified-memory-profiling", + "off", + "--concurrent-kernels", + "off", + "--print-gpu-trace", + "-u", + "us", + "--demangling", + "off", + "--csv", + "--log-file", + ]) + .arg(log_file_path.as_ref()) + .arg(executable.as_ref()) + .args(args.into_iter()); - let metrics = parse_nvprof_csv(&mut log_reader).map_err(|source| Error::Parse { - raw_log: original_log.clone(), - source, + let result = cmd.output().await?; + if !result.status.success() { + return Err(Error::Command(utils::CommandError::new(&cmd, result))); + } + + let log_file = std::fs::OpenOptions::new() + .read(true) + .open(&log_file_path)?; + + let mut log_reader = std::io::BufReader::new(log_file); + + let mut raw_log = String::new(); + log_reader.read_to_string(&mut raw_log)?; + + let mut log_reader = std::io::Cursor::new(&raw_log); + match parse_nvprof_csv(&mut log_reader) { + Err(source) => Err(Error::Parse { raw_log, source }), + Ok(commands) => Ok((raw_log, commands)), + } +} + +/// Profile test application using nvprof profiler. +/// +/// Note: `nvprof` is not compatible with newer devices. +/// +/// # Errors +/// - When creating temp dir fails. +/// - When profiling fails. +/// - When application fails. +pub async fn nvprof( + executable: impl AsRef, + args: A, + _options: &Options, +) -> Result +where + A: Clone + IntoIterator, + ::Item: AsRef, +{ + let tmp_dir = tempfile::tempdir()?; + let log_file_path = tmp_dir.path().join("log_file.csv"); + + let nvprof = which::which("nvprof").map_err(|_| Error::MissingProfiler("nvprof".into())); + let nvprof = nvprof.or_else(|_| { + let cuda = utils::find_cuda().ok_or(Error::MissingCUDA)?; + Ok::<_, Error>(cuda.join("bin/nvprof")) })?; - Ok(ProfilingResult { - raw: original_log, + let nvprof = nvprof + .canonicalize() + .map_err(|_| Error::MissingProfiler(nvprof))?; + + let executable = executable + .as_ref() + .canonicalize() + .map_err(|_| Error::MissingExecutable(executable.as_ref().into()))?; + + let (raw_metrics_log, metrics) = + profile_all_metrics(&nvprof, &executable, args.clone(), &log_file_path).await?; + + let (raw_commands_log, commands) = + profile_commands(&nvprof, &executable, args, &log_file_path).await?; + + Ok(Output { + raw_metrics_log, + raw_commands_log, metrics, + commands, }) } @@ -159,6 +257,7 @@ where mod tests { use super::{parse_nvprof_csv, Metric}; use color_eyre::eyre; + use similar_asserts as diff; use std::io::Cursor; #[test] @@ -167,24 +266,90 @@ mod tests { let log = String::from_utf8_lossy(bytes).to_string(); dbg!(&log); let mut log_reader = Cursor::new(bytes); - let metrics = parse_nvprof_csv(&mut log_reader)?; + let mut metrics: Vec = parse_nvprof_csv(&mut log_reader)?; + diff::assert_eq!(metrics.len(), 1); + let metrics = metrics.remove(0); dbg!(&metrics); - assert_eq!( + diff::assert_eq!( metrics.device, Metric::new("NVIDIA GeForce GTX 1080 (0)".to_string(), None) ); - assert_eq!( + diff::assert_eq!( metrics.kernel, Metric::new("_Z6vecAddIfEvPT_S1_S1_i".to_string(), None) ); - assert_eq!(metrics.context, Metric::new(1, None)); - assert_eq!(metrics.stream, Metric::new(7, None)); - assert_eq!(metrics.dram_write_bytes, Metric::new(0, None)); - assert_eq!(metrics.dram_read_bytes, Metric::new(7136, None)); - assert_eq!(metrics.dram_read_transactions, Metric::new(223, None)); - assert_eq!(metrics.dram_write_transactions, Metric::new(0, None)); - assert_eq!(metrics.l2_read_transactions, Metric::new(66, None)); - assert_eq!(metrics.l2_write_transactions, Metric::new(26, None)); + diff::assert_eq!(metrics.context, Metric::new(1, None)); + diff::assert_eq!(metrics.stream, Metric::new(7, None)); + diff::assert_eq!(metrics.dram_write_bytes, Metric::new(0, None)); + diff::assert_eq!(metrics.dram_read_bytes, Metric::new(7136, None)); + diff::assert_eq!(metrics.dram_read_transactions, Metric::new(223, None)); + diff::assert_eq!(metrics.dram_write_transactions, Metric::new(0, None)); + diff::assert_eq!(metrics.l2_read_transactions, Metric::new(66, None)); + diff::assert_eq!(metrics.l2_write_transactions, Metric::new(26, None)); + Ok(()) + } + + #[test] + fn parse_commands() -> eyre::Result<()> { + use super::metrics::Command; + let bytes = include_bytes!("../../tests/nvprof_vectoradd_100_32_commands.txt"); + let log = String::from_utf8_lossy(bytes).to_string(); + dbg!(&log); + let mut log_reader = Cursor::new(bytes); + let metrics: Vec = parse_nvprof_csv(&mut log_reader)?; + dbg!(&metrics); + diff::assert_eq!(metrics.len(), 5); + + diff::assert_eq!( + have: metrics[0], + want: Command { + start: Metric::new(245729.104000, "us".to_string()), + duration: Metric::new(1.088000, "us".to_string()), + grid_x: Metric::new(None, None), + grid_y: Metric::new(None, None), + grid_z: Metric::new(None, None), + block_x: Metric::new(None, None), + block_y: Metric::new(None, None), + block_z: Metric::new(None, None), + registers_per_thread: Metric::new(None, None), + static_shared_memory: Metric::new(None, "B".to_string()), + dynamic_shared_memory: Metric::new(None, "B".to_string()), + size: Metric::new(400, "B".to_string()), + throughput: Metric::new(350.615557, "MB/s".to_string()), + src_mem_type: Metric::new("Pageable".to_string(), None), + dest_mem_type: Metric::new("Device".to_string(), None), + device: Metric::new("NVIDIA GeForce GTX 1080 (0)".to_string(), None), + context: Metric::new(1, None), + stream: Metric::new(7, None), + name: Metric::new("[CUDA memcpy HtoD]".to_string(), None), + correlation_id: Metric::new(117, None), + }, + ); + diff::assert_eq!( + have: metrics[3], + want: Command { + start: Metric::new(245767.824000, "us".to_string()), + duration: Metric::new(3.264000, "us".to_string()), + grid_x: Metric::new(1, None), + grid_y: Metric::new(1, None), + grid_z: Metric::new(1, None), + block_x: Metric::new(1024, None), + block_y: Metric::new(1, None), + block_z: Metric::new(1, None), + registers_per_thread: Metric::new(8, None), + static_shared_memory: Metric::new(0, "B".to_string()), + dynamic_shared_memory: Metric::new(0, "B".to_string()), + size: Metric::new(None, "B".to_string()), + throughput: Metric::new(None, "MB/s".to_string()), + src_mem_type: Metric::new(None, None), + dest_mem_type: Metric::new(None, None), + device: Metric::new("NVIDIA GeForce GTX 1080 (0)".to_string(), None), + context: Metric::new(1, None), + stream: Metric::new(7, None), + name: Metric::new("_Z6vecAddIfEvPT_S1_S1_i".to_string(), None), + correlation_id: Metric::new(123, None), + }, + ); Ok(()) } } diff --git a/profile/tests/nvprof_vectoradd_100_32_commands.txt b/profile/tests/nvprof_vectoradd_100_32_commands.txt new file mode 100755 index 00000000..e52e2616 --- /dev/null +++ b/profile/tests/nvprof_vectoradd_100_32_commands.txt @@ -0,0 +1,10 @@ +==2424234== NVPROF is profiling process 2424234, command: /home/roman/dev/box/test-apps/vectoradd/vectoradd 100 32 +==2424234== Profiling application: /home/roman/dev/box/test-apps/vectoradd/vectoradd 100 32 +==2424234== Profiling result: +"Start","Duration","Grid X","Grid Y","Grid Z","Block X","Block Y","Block Z","Registers Per Thread","Static SMem","Dynamic SMem","Size","Throughput","SrcMemType","DstMemType","Device","Context","Stream","Name","Correlation_ID" +us,us,,,,,,,,B,B,B,MB/s,,,,,,, +245729.104000,1.088000,,,,,,,,,,400,350.615557,"Pageable","Device","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy HtoD]",117 +245736.176000,0.672000,,,,,,,,,,400,567.663283,"Pageable","Device","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy HtoD]",119 +245742.384000,0.672000,,,,,,,,,,400,567.663283,"Pageable","Device","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy HtoD]",121 +245767.824000,3.264000,1,1,1,1024,1,1,8,0,0,,,,,"NVIDIA GeForce GTX 1080 (0)","1","7","_Z6vecAddIfEvPT_S1_S1_i",123 +245780.080000,1.152000,,,,,,,,,,400,331.136915,"Device","Pageable","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy DtoH]",125 diff --git a/stats/Cargo.toml b/stats/Cargo.toml index e7722255..7a46b712 100644 --- a/stats/Cargo.toml +++ b/stats/Cargo.toml @@ -14,5 +14,6 @@ denylist = ["default"] indexmap = { version = "2", features = ["serde"] } serde = { version = "1", features = ["derive"] } strum = { version = "0", features = ["derive"] } +utils = { path = "../utils" } [dev-dependencies] diff --git a/stats/src/cache.rs b/stats/src/cache.rs index bd8cae46..fa9fd6da 100644 --- a/stats/src/cache.rs +++ b/stats/src/cache.rs @@ -218,17 +218,14 @@ pub type PerCacheCsvRow = (usize, CsvRow); #[allow(clippy::module_name_repetitions)] #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct PerCache(pub Box<[Cache]>); -// pub struct PerCache(pub indexmap::IndexMap); -// pub struct PerCache(pub HashMap); impl PerCache { #[must_use] pub fn new(size: usize) -> Self { - Self(vec![Cache::default(); size].into_boxed_slice()) + Self(utils::box_slice![Cache::default(); size]) } #[must_use] - // pub fn into_inner(self) -> indexmap::IndexMap { pub fn into_inner(self) -> Box<[Cache]> { self.0 } diff --git a/stats/src/dram.rs b/stats/src/dram.rs index 424f9481..e9c33d97 100644 --- a/stats/src/dram.rs +++ b/stats/src/dram.rs @@ -1,66 +1,124 @@ use serde::{Deserialize, Serialize}; +use utils::box_slice; -// use indexmap::IndexMap; -// #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] -// pub struct JSONDRAM { -// /// bank writes [shader id][dram chip id][bank id] -// pub bank_writes: IndexMap>>, -// /// bank reads [shader id][dram chip id][bank id] -// pub bank_reads: IndexMap>>, -// /// bank writes [dram chip id][bank id] -// pub total_bank_writes: IndexMap>, -// /// bank reads [dram chip id][bank id] -// pub total_bank_reads: IndexMap>, -// } +#[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct BankAccessesCsvRow { + /// Core ID + core_id: usize, + /// DRAM chip ID + chip_id: usize, + /// Bank ID + bank_id: usize, + /// Number of reads + reads: u64, + /// Number of writes + writes: u64, +} #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] -pub struct PerCoreDRAM { - /// bank writes [shader id][dram chip id][bank id] - pub bank_writes: Vec>>, - /// bank reads [shader id][dram chip id][bank id] - pub bank_reads: Vec>>, +pub struct AccessesCsvRow { + /// DRAM chip ID + chip_id: usize, + /// Bank ID + bank_id: usize, + /// Number of reads + reads: u64, + /// Number of writes + writes: u64, } #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct DRAM { - /// bank writes [shader id][dram chip id][bank id] - pub bank_writes: Vec>>, - /// bank reads [shader id][dram chip id][bank id] - pub bank_reads: Vec>>, - /// bank writes [dram chip id][bank id] - pub total_bank_writes: Vec>, - /// bank reads [dram chip id][bank id] - pub total_bank_reads: Vec>, + /// Number of bank writes [shader id][dram chip id][bank id] + pub bank_writes: Box<[Box<[Box<[u64]>]>]>, + /// Number of bank reads [shader id][dram chip id][bank id] + pub bank_reads: Box<[Box<[Box<[u64]>]>]>, + /// Number of bank writes [dram chip id][bank id] + pub total_bank_writes: Box<[Box<[u64]>]>, + /// Number of bank reads [dram chip id][bank id] + pub total_bank_reads: Box<[Box<[u64]>]>, + + /// Number of cores + pub num_cores: usize, + /// Number of DRAM chips + pub num_chips: usize, + /// Number of banks + pub num_banks: usize, } impl DRAM { #[must_use] pub fn new(num_total_cores: usize, num_mem_units: usize, num_banks: usize) -> Self { - let total_bank_writes = vec![vec![0; num_banks]; num_mem_units]; + let total_bank_writes = box_slice![box_slice![0; num_banks]; num_mem_units]; let total_bank_reads = total_bank_writes.clone(); - let bank_reads = vec![total_bank_reads.clone(); num_total_cores]; + let bank_reads = box_slice![total_bank_reads.clone(); num_total_cores]; let bank_writes = bank_reads.clone(); Self { bank_writes, bank_reads, total_bank_writes, total_bank_reads, + num_banks, + num_cores: num_total_cores, + num_chips: num_mem_units, + } + } + + #[must_use] + pub fn bank_accesses_csv(&self) -> Vec { + let mut out = Vec::new(); + for core_id in 0..self.num_cores { + for chip_id in 0..self.num_chips { + for bank_id in 0..self.num_banks { + let reads = self.bank_reads[core_id][chip_id][bank_id]; + let writes = self.bank_writes[core_id][chip_id][bank_id]; + out.push(BankAccessesCsvRow { + core_id, + chip_id, + bank_id, + reads, + writes, + }); + } + } } + out } #[must_use] - pub fn flatten(self) -> Self { - todo!("flatten dram stats"); + pub fn accesses_csv(&self) -> Vec { + let mut out = Vec::new(); + for chip_id in 0..self.num_chips { + for bank_id in 0..self.num_banks { + let reads = self.total_bank_reads[chip_id][bank_id]; + let writes = self.total_bank_writes[chip_id][bank_id]; + out.push(AccessesCsvRow { + chip_id, + bank_id, + reads, + writes, + }); + } + } + out } #[must_use] pub fn total_reads(&self) -> u64 { - self.total_bank_reads.iter().flatten().sum() + self.total_bank_reads + .iter() + .map(AsRef::as_ref) + .flatten() + .sum() } #[must_use] pub fn total_writes(&self) -> u64 { - self.total_bank_writes.iter().flatten().sum() + self.total_bank_writes + .iter() + .map(AsRef::as_ref) + .flatten() + .sum() } // #[must_use] diff --git a/test-apps/test-apps-materialized.yml b/test-apps/test-apps-materialized.yml index cdbd6c5c..799c2d56 100755 --- a/test-apps/test-apps-materialized.yml +++ b/test-apps/test-apps-materialized.yml @@ -2,7 +2,7 @@ ## ## AUTO GENERATED! DO NOT EDIT ## -## this configuration was materialized from /home/roman/dev/box/test-apps/test-apps.yml on 23/08/2023 14:44:48 +## this configuration was materialized from /home/roman/dev/box/test-apps/test-apps.yml on 01/09/2023 01:38:57 ## config: diff --git a/utils/src/lib.rs b/utils/src/lib.rs index 1aba5a85..79acd029 100644 --- a/utils/src/lib.rs +++ b/utils/src/lib.rs @@ -91,6 +91,19 @@ macro_rules! decode_utf8 { }; } +#[macro_export] +macro_rules! box_slice { + () => ( + std::vec::Vec::new().into_boxed_slice() + ); + ($elem:expr; $n:expr) => ( + std::vec::from_elem($elem, $n).into_boxed_slice() + ); + ($($x:expr),+ $(,)?) => ( + std::vec![$($x),+].into_boxed_slice() + ); +} + impl CommandError { pub fn into_eyre(self) -> eyre::Report { let command_section = self.command.clone().header("command:"); diff --git a/validate/src/accelsim.rs b/validate/src/accelsim.rs index a29520da..8613f5ca 100644 --- a/validate/src/accelsim.rs +++ b/validate/src/accelsim.rs @@ -177,10 +177,12 @@ pub async fn simulate( // let flat_stats: Vec<_> = stats.into_inner().into_iter().collect(); // serde_json::to_writer_pretty(open_writable(&stats_out_file)?, &flat_stats)?; - serde_json::to_writer_pretty( - open_writable(stats_dir.join("exec_time.json"))?, - &dur.as_millis(), - ) - .map_err(eyre::Report::from)?; + #[cfg(debug_assertions)] + let exec_time_file_path = stats_dir.join("exec_time.debug.json"); + #[cfg(not(debug_assertions))] + let exec_time_file_path = stats_dir.join("exec_time.release.json"); + + serde_json::to_writer_pretty(open_writable(exec_time_file_path)?, &dur.as_millis()) + .map_err(eyre::Report::from)?; Ok(()) } diff --git a/validate/src/benchmark/matrix.rs b/validate/src/benchmark/matrix.rs index 72c2be36..464b18fc 100644 --- a/validate/src/benchmark/matrix.rs +++ b/validate/src/benchmark/matrix.rs @@ -181,9 +181,6 @@ pub fn expand(inputs: &Inputs, includes: &Includes, excludes: &Excludes) -> Vec< let intersecting_entries: Vec<_> = current_entries.intersection(&include_entries).collect(); - dbg!(&intersecting_keys); - dbg!(&intersecting_entries); - assert!(!current.is_empty()); if intersecting_keys.is_empty() { // does not overwrite anything: extend combination diff --git a/validate/src/options.rs b/validate/src/options.rs index 14d3d123..3a2f6513 100644 --- a/validate/src/options.rs +++ b/validate/src/options.rs @@ -52,7 +52,12 @@ pub struct Options { #[clap(short = 'b', long = "bench", help = "name of benchmark to run")] pub selected_benchmarks: Vec, - #[clap(long = "force", help = "force re-run", default_value = "false")] + #[clap( + short = 'f', + long = "force", + help = "force re-run", + default_value = "false" + )] pub force: bool, #[clap(long = "fail-fast", help = "fail fast", default_value = "false")] diff --git a/validate/src/playground.rs b/validate/src/playground.rs index 3874fb07..9dbb7b0f 100644 --- a/validate/src/playground.rs +++ b/validate/src/playground.rs @@ -92,11 +92,16 @@ pub async fn simulate( create_dirs(&stats_dir).map_err(eyre::Report::from)?; let _stats_out_file = stats_dir.join("stats.json"); - let exec_dur_file = stats_dir.join("exec_time.json"); // let flat_stats: Vec<_> = stats.into_iter().collect(); // serde_json::to_writer_pretty(open_writable(&stats_out_file)?, &flat_stats)?; - serde_json::to_writer_pretty(open_writable(exec_dur_file)?, &dur.as_millis()) + + #[cfg(debug_assertions)] + let exec_time_file_path = stats_dir.join("exec_time.debug.json"); + #[cfg(not(debug_assertions))] + let exec_time_file_path = stats_dir.join("exec_time.release.json"); + + serde_json::to_writer_pretty(open_writable(exec_time_file_path)?, &dur.as_millis()) .map_err(eyre::Report::from)?; Ok(()) } diff --git a/validate/src/profile.rs b/validate/src/profile.rs index ebd43149..3b3888f9 100644 --- a/validate/src/profile.rs +++ b/validate/src/profile.rs @@ -6,6 +6,7 @@ use crate::{ }; use color_eyre::eyre; use std::io::Write; +use std::path::Path; use utils::fs::create_dirs; pub async fn profile( @@ -14,28 +15,45 @@ pub async fn profile( _trace_opts: &options::Profile, ) -> Result<(), RunError> { let profile_dir = &bench.profile.profile_dir; - // dbg!(&profile_dir); create_dirs(profile_dir).map_err(eyre::Report::from)?; - let log_file = profile_dir.join("profile.log"); - let metrics_file = profile_dir.join("profile.metrics.csv"); + let metrics_log_file = profile_dir.join("profile.nvprof.metrics.log"); + let commands_log_file = profile_dir.join("profile.nvprof.commands.log"); + let metrics_file_json = profile_dir.join("profile.metrics.json"); + let commands_file_json = profile_dir.join("profile.commands.json"); - if !options.force && log_file.is_file() && metrics_file.is_file() { + if !options.force + && [ + metrics_log_file.as_path(), + commands_log_file.as_path(), + metrics_file_json.as_path(), + commands_file_json.as_path(), + ] + .into_iter() + .all(Path::is_file) + { return Err(RunError::Skipped); } let options = profile::nvprof::Options {}; - let results = profile::nvprof::nvprof(&bench.executable, &bench.args, &options) + let output = profile::nvprof::nvprof(&bench.executable, &bench.args, &options) .await .map_err(|err| match err { profile::Error::Command(err) => err.into_eyre(), err => err.into(), })?; - serde_json::to_writer_pretty(open_writable(&metrics_file)?, &results.metrics) + open_writable(&metrics_log_file)? + .write_all(output.raw_metrics_log.as_bytes()) .map_err(eyre::Report::from)?; - open_writable(&log_file)? - .write_all(results.raw.as_bytes()) + open_writable(&commands_log_file)? + .write_all(output.raw_commands_log.as_bytes()) .map_err(eyre::Report::from)?; + + serde_json::to_writer_pretty(open_writable(&metrics_file_json)?, &output.metrics) + .map_err(eyre::Report::from)?; + serde_json::to_writer_pretty(open_writable(&commands_file_json)?, &output.commands) + .map_err(eyre::Report::from)?; + Ok(()) } diff --git a/validate/src/simulate.rs b/validate/src/simulate.rs index a4e109e9..640bd286 100644 --- a/validate/src/simulate.rs +++ b/validate/src/simulate.rs @@ -94,11 +94,13 @@ pub async fn simulate( crate::stats::write_stats_as_csv(&stats_dir, stats)?; - serde_json::to_writer_pretty( - open_writable(stats_dir.join("exec_time.json"))?, - &dur.as_millis(), - ) - .map_err(eyre::Report::from)?; + #[cfg(debug_assertions)] + let exec_time_file_path = stats_dir.join("exec_time.debug.json"); + #[cfg(not(debug_assertions))] + let exec_time_file_path = stats_dir.join("exec_time.release.json"); + + serde_json::to_writer_pretty(open_writable(exec_time_file_path)?, &dur.as_millis()) + .map_err(eyre::Report::from)?; // let json_stats: stats::FlatStats = stats.clone().into(); // let json_stats_out_file = stats_dir.join("stats.json"); diff --git a/validate/src/stats.rs b/validate/src/stats.rs index d9fb5839..f617c0f3 100644 --- a/validate/src/stats.rs +++ b/validate/src/stats.rs @@ -62,23 +62,32 @@ pub fn write_csv_rows( pub fn write_stats_as_csv(stats_dir: impl AsRef, stats: stats::Stats) -> eyre::Result<()> { let stats_dir = stats_dir.as_ref(); + // sim stats write_csv_rows(open_writable(sim_stats_path(stats_dir))?, &[stats.sim])?; - // validate::write_csv_rows( - // open_writable(stats_dir.join("stats.dram.csv"))?, - // &[stats::dram::PerCoreDRAM { - // bank_writes: stats.dram.bank_writes, - // bank_reads: stats.dram.bank_reads, - // }], - // )?; + + // dram stats + write_csv_rows( + open_writable(stats_dir.join("stats.dram.csv"))?, + &stats.dram.accesses_csv(), + )?; + write_csv_rows( + open_writable(stats_dir.join("stats.dram.banks.csv"))?, + &stats.dram.bank_accesses_csv(), + )?; + + // access stats write_csv_rows( open_writable(access_stats_path(stats_dir))?, &stats.accesses.flatten(), )?; + + // instruction stats write_csv_rows( open_writable(instruction_stats_path(stats_dir))?, &stats.instructions.flatten(), )?; + // cache stats for (cache, rows) in [ (Cache::L1I, stats.l1i_stats.flatten()), (Cache::L1D, stats.l1d_stats.flatten()),