From 7e979f50ddaa52767c53dc042e670a45a8c39ba6 Mon Sep 17 00:00:00 2001
From: romnnn <roman.dahm@gmx.de>
Date: Fri, 1 Sep 2023 03:24:13 +0200
Subject: [PATCH] validate: profile CUDA commands using nvprof

---
 Cargo.lock                                    |   3 +
 Pipfile                                       |   4 +-
 WIP.md                                        |  11 +-
 accelsim/src/stats.rs                         |  12 +-
 benches/vectoradd.rs                          |   4 +-
 gpucachesim/__init__.py                       |   3 +
 gpucachesim/benchmarks.py                     |  42 ++
 gpucachesim/stats/__init__.py                 |  27 +
 gpucachesim/stats/accelsim.py                 |   2 +
 gpucachesim/stats/native.py                   |   6 +
 gpucachesim/stats/stats.py                    |   6 +
 notebooks/plots.ipynb                         | 591 +++++++++++++++++-
 playground/sys/src/ref/memory_stats.hpp       |   2 -
 profile/Cargo.toml                            |   2 +
 profile/src/lib.rs                            |  23 +-
 profile/src/main.rs                           |  70 ++-
 profile/src/nsight/metrics.rs                 |   5 +
 profile/src/nsight/mod.rs                     |  10 +
 profile/src/nvprof/metrics.rs                 |  44 ++
 profile/src/nvprof/mod.rs                     | 309 ++++++---
 .../nvprof_vectoradd_100_32_commands.txt      |  10 +
 stats/Cargo.toml                              |   1 +
 stats/src/cache.rs                            |   5 +-
 stats/src/dram.rs                             | 120 +++-
 test-apps/test-apps-materialized.yml          |   2 +-
 utils/src/lib.rs                              |  13 +
 validate/src/accelsim.rs                      |  12 +-
 validate/src/benchmark/matrix.rs              |   3 -
 validate/src/options.rs                       |   7 +-
 validate/src/playground.rs                    |   9 +-
 validate/src/profile.rs                       |  34 +-
 validate/src/simulate.rs                      |  12 +-
 validate/src/stats.rs                         |  23 +-
 33 files changed, 1242 insertions(+), 185 deletions(-)
 create mode 100644 gpucachesim/__init__.py
 create mode 100644 gpucachesim/benchmarks.py
 create mode 100644 gpucachesim/stats/__init__.py
 create mode 100644 gpucachesim/stats/accelsim.py
 create mode 100644 gpucachesim/stats/native.py
 create mode 100644 gpucachesim/stats/stats.py
 create mode 100644 profile/src/nsight/metrics.rs
 create mode 100644 profile/src/nsight/mod.rs
 create mode 100755 profile/tests/nvprof_vectoradd_100_32_commands.txt

diff --git a/Cargo.lock b/Cargo.lock
index e4436678..177adcc6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2828,12 +2828,14 @@ dependencies = [
  "clap",
  "color-eyre",
  "csv",
+ "indexmap 2.0.0",
  "log",
  "once_cell",
  "pretty_assertions_sorted",
  "regex",
  "serde",
  "serde_json",
+ "similar-asserts",
  "tempfile",
  "thiserror",
  "tokio",
@@ -3588,6 +3590,7 @@ dependencies = [
  "indexmap 2.0.0",
  "serde",
  "strum",
+ "utils",
 ]
 
 [[package]]
diff --git a/Pipfile b/Pipfile
index 47eb2075..68b6a32a 100644
--- a/Pipfile
+++ b/Pipfile
@@ -11,10 +11,12 @@ matplotlib = "*"
 numpy = "*"
 scipy = "*"
 pandas = "*"
-jupyterlab = "*"
 wasabi = "*"
+click = "*"
 
 [dev-packages]
 invoke = "*"
 flake8 = "*"
 black = "*"
+jupyterlab = "*"
+mypy = "*"
diff --git a/WIP.md b/WIP.md
index c81f933a..ed54c181 100644
--- a/WIP.md
+++ b/WIP.md
@@ -1,16 +1,12 @@
 #### TODO
 
-- notes
-
-  - l2 cache is shared in sub partitions, we get different l2 metrics
-  - l1i are the same always, as they are local to the core
-
 - today:
 
-  - record mem fetch latency in playground and box
-  - write trait for tag array
+  - convert, match and plot statistics
   - rename crates and github repo
   - publish to crates.io
+  - record mem fetch latency in playground and box
+  - DONE: write trait for tag array
 
 - TODO:
 
@@ -21,7 +17,6 @@
   - use traits for common components
   - record mem fetch latency
   - add a few more stats
-  - plot statistics
   - publish python package to pip
 
 - tomorrow:
diff --git a/accelsim/src/stats.rs b/accelsim/src/stats.rs
index 79ab0832..efab47db 100644
--- a/accelsim/src/stats.rs
+++ b/accelsim/src/stats.rs
@@ -1,5 +1,6 @@
 use color_eyre::eyre;
 use std::collections::HashMap;
+use utils::box_slice;
 
 pub type Stat = (String, u16, String);
 pub type Map = indexmap::IndexMap<Stat, f64>;
@@ -102,7 +103,7 @@ impl TryFrom<Stats> for stats::Stats {
         .into_iter()
         .collect();
 
-        let mut l2d_stats = stats::PerCache(vec![stats::Cache::default(); 1].into_boxed_slice());
+        let mut l2d_stats = stats::PerCache(box_slice![stats::Cache::default(); 1]);
         let l2d_total = &mut l2d_stats[0];
         for kind in AccessKind::iter() {
             for reservation_failure in ReservationFailure::iter() {
@@ -160,10 +161,11 @@ impl TryFrom<Stats> for stats::Stats {
             },
             accesses: stats::Accesses(accesses),
             dram: stats::DRAM {
-                bank_writes: vec![vec![vec![total_dram_writes]]],
-                bank_reads: vec![vec![vec![total_dram_reads]]],
-                total_bank_writes: vec![vec![total_dram_writes]],
-                total_bank_reads: vec![vec![total_dram_reads]],
+                bank_writes: box_slice![box_slice![box_slice![total_dram_writes]]],
+                bank_reads: box_slice![box_slice![box_slice![total_dram_reads]]],
+                total_bank_writes: box_slice![box_slice![total_dram_writes]],
+                total_bank_reads: box_slice![box_slice![total_dram_reads]],
+                ..stats::DRAM::default()
             },
             instructions: stats::InstructionCounts::default(),
             l1i_stats: stats::PerCache::new(0),
diff --git a/benches/vectoradd.rs b/benches/vectoradd.rs
index f1333407..32a0eb1b 100644
--- a/benches/vectoradd.rs
+++ b/benches/vectoradd.rs
@@ -111,10 +111,10 @@ pub fn box_benchmark(c: &mut Criterion) {
 }
 
 criterion::criterion_group!(benches, box_benchmark, play_benchmark, accelsim_benchmark);
-// criterion::criterion_main!(benches);
+criterion::criterion_main!(benches);
 
 #[allow(dead_code)]
-fn main() -> eyre::Result<()> {
+fn main_other() -> eyre::Result<()> {
     use itertools::Itertools;
     #[allow(unused_imports)]
     use std::io::Write;
diff --git a/gpucachesim/__init__.py b/gpucachesim/__init__.py
new file mode 100644
index 00000000..caceb2e4
--- /dev/null
+++ b/gpucachesim/__init__.py
@@ -0,0 +1,3 @@
+from pathlib import Path
+
+ROOT_DIR = Path(__file__).parent
diff --git a/gpucachesim/benchmarks.py b/gpucachesim/benchmarks.py
new file mode 100644
index 00000000..a58dc494
--- /dev/null
+++ b/gpucachesim/benchmarks.py
@@ -0,0 +1,42 @@
+import click
+import yaml
+from pathlib import Path
+from os import PathLike
+from typing import Optional
+
+from gpucachesim import ROOT_DIR
+
+REPO_ROOT_DIR = ROOT_DIR.parent
+DEFAULT_BENCH_FILE = REPO_ROOT_DIR / "test-apps/test-apps-materialized.yml"
+
+
+class Benchmarks:
+    def __init__(self, path: Optional[PathLike] = None) -> None:
+        """load the materialized benchmark config"""
+
+        with open(path or DEFAULT_BENCH_FILE, "rb") as f:
+            benchmarks = yaml.safe_load(f)
+
+        self.benchmarks = benchmarks["benchmarks"]
+
+    def __getitem__(self, bench_name: str):
+        return self.benchmarks[bench_name]
+
+    def get_bench_config(self, bench_name: str, input_idx: int):
+        return self.benchmarks[bench_name][input_idx]
+
+
+@click.command()
+@click.option("--path", default=DEFAULT_BENCH_FILE, help="Path to materialized benchmark config")
+def main(path):
+    from pprint import pprint
+
+    print(path)
+    b = Benchmarks(path)
+
+    benchmark_names = list(b.benchmarks.keys())
+    pprint(benchmark_names)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gpucachesim/stats/__init__.py b/gpucachesim/stats/__init__.py
new file mode 100644
index 00000000..c67a3682
--- /dev/null
+++ b/gpucachesim/stats/__init__.py
@@ -0,0 +1,27 @@
+import click
+
+import gpucachesim.stats.stats as stats
+import gpucachesim.stats.native as native
+from gpucachesim.benchmarks import Benchmarks
+
+
+@click.command()
+@click.option("--path", help="Path to materialized benchmark config")
+@click.option("--bench", help="Benchmark name")
+@click.option("--input", default=0, help="Input index")
+def main(path, bench, input):
+    from pprint import pprint
+
+    b = Benchmarks(path)
+    if bench is None:
+        raise NotImplementedError
+    print(bench, input)
+    bench_config = b.get_bench_config(bench, input)
+    # pprint(bench_config)
+
+    our_stats = stats.Stats(bench_config["simulate"])
+    native_stats = native.Stats(bench_config["simulate"])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gpucachesim/stats/accelsim.py b/gpucachesim/stats/accelsim.py
new file mode 100644
index 00000000..c8fe4366
--- /dev/null
+++ b/gpucachesim/stats/accelsim.py
@@ -0,0 +1,2 @@
+class AccelsimStats:
+    pass
diff --git a/gpucachesim/stats/native.py b/gpucachesim/stats/native.py
new file mode 100644
index 00000000..b6300e48
--- /dev/null
+++ b/gpucachesim/stats/native.py
@@ -0,0 +1,6 @@
+from os import PathLike
+
+
+class Stats:
+    def __init__(self, result_dir: PathLike) -> None:
+        self.path = result_dir
diff --git a/gpucachesim/stats/stats.py b/gpucachesim/stats/stats.py
new file mode 100644
index 00000000..b6300e48
--- /dev/null
+++ b/gpucachesim/stats/stats.py
@@ -0,0 +1,6 @@
+from os import PathLike
+
+
+class Stats:
+    def __init__(self, result_dir: PathLike) -> None:
+        self.path = result_dir
diff --git a/notebooks/plots.ipynb b/notebooks/plots.ipynb
index dc2ee541..bc9eac3e 100644
--- a/notebooks/plots.ipynb
+++ b/notebooks/plots.ipynb
@@ -25,11 +25,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['vectorAdd', 'simple_matrixmul']\n"
+      "['vectorAdd', 'simple_matrixmul', 'matrixmul', 'transpose']\n"
      ]
     }
    ],
    "source": [
+    "# load the materialized benchmark config\n",
     "benchmark_file = \"../test-apps/test-apps-materialized.yml\"\n",
     "with open(benchmark_file, \"rb\") as f:\n",
     "    benchmarks = yaml.safe_load(f)\n",
@@ -46,6 +47,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# define targets to use\n",
     "targets = {\n",
     "    \"accelsim_simulate\": \"\",\n",
     "    \"simulate\": \"\",\n",
@@ -58,7 +60,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "165b6c35-95c7-458f-9f4e-3b25d21122bd",
    "metadata": {},
    "outputs": [
@@ -68,14 +70,69 @@
      "text": [
       "[(0, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['100', '32']),\n",
       " (1, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['1000', '32']),\n",
-      " (2, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['10000', '32'])]\n"
+      " (2, '/home/roman/dev/box/test-apps/vectoradd/vectoradd', ['10000', '32'])]\n",
+      "{'accelsim_simulate': {'concurrency': None,\n",
+      "                       'config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.config',\n",
+      "                       'config_dir': '/home/roman/dev/box/accelsim/gtx1080',\n",
+      "                       'enabled': True,\n",
+      "                       'inter_config': '/home/roman/dev/box/accelsim/gtx1080/config_fermi_islip.icnt',\n",
+      "                       'repetitions': 2,\n",
+      "                       'results_dir': '/home/roman/dev/box/results',\n",
+      "                       'stats_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/accelsim-sim',\n",
+      "                       'timeout': None,\n",
+      "                       'trace_config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.trace.config'},\n",
+      " 'accelsim_trace': {'concurrency': 1,\n",
+      "                    'enabled': True,\n",
+      "                    'repetitions': 1,\n",
+      "                    'results_dir': '/home/roman/dev/box/results',\n",
+      "                    'timeout': None,\n",
+      "                    'traces_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/accelsim-trace'},\n",
+      " 'args': ['100', '32'],\n",
+      " 'benchmark_idx': 0,\n",
+      " 'executable': '/home/roman/dev/box/test-apps/vectoradd/vectoradd',\n",
+      " 'input_idx': 0,\n",
+      " 'name': 'vectorAdd',\n",
+      " 'path': '/home/roman/dev/box/test-apps/vectoradd',\n",
+      " 'playground_simulate': {'concurrency': 1,\n",
+      "                         'config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.config',\n",
+      "                         'config_dir': '/home/roman/dev/box/accelsim/gtx1080',\n",
+      "                         'enabled': True,\n",
+      "                         'inter_config': '/home/roman/dev/box/accelsim/gtx1080/config_fermi_islip.icnt',\n",
+      "                         'repetitions': 2,\n",
+      "                         'results_dir': '/home/roman/dev/box/results',\n",
+      "                         'stats_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/playground-sim',\n",
+      "                         'timeout': None,\n",
+      "                         'trace_config': '/home/roman/dev/box/accelsim/gtx1080/gpgpusim.trace.config'},\n",
+      " 'profile': {'concurrency': 1,\n",
+      "             'enabled': True,\n",
+      "             'profile_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/profile',\n",
+      "             'repetitions': 5,\n",
+      "             'results_dir': '/home/roman/dev/box/results',\n",
+      "             'timeout': None},\n",
+      " 'simulate': {'concurrency': None,\n",
+      "              'enabled': True,\n",
+      "              'parallel': False,\n",
+      "              'repetitions': 2,\n",
+      "              'results_dir': '/home/roman/dev/box/results',\n",
+      "              'stats_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/sim',\n",
+      "              'timeout': None},\n",
+      " 'trace': {'concurrency': 1,\n",
+      "           'enabled': True,\n",
+      "           'full_trace': False,\n",
+      "           'repetitions': 1,\n",
+      "           'results_dir': '/home/roman/dev/box/results',\n",
+      "           'save_json': True,\n",
+      "           'timeout': None,\n",
+      "           'traces_dir': '/home/roman/dev/box/results/vectorAdd/vectorAdd-dtype-32-length-100/trace'},\n",
+      " 'values': {'dtype': 32, 'length': 100}}\n"
      ]
     }
    ],
    "source": [
+    "# check all benchmark configs for vectoradd\n",
     "vectoradd = benchmarks[\"vectorAdd\"]\n",
     "pprint([(b[\"input_idx\"], b[\"executable\"], b[\"args\"]) for b in vectoradd])\n",
-    "# pprint(vectoradd[0])"
+    "pprint(vectoradd[0])"
    ]
   },
   {
@@ -202,6 +259,147 @@
     "sim_df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "82d165ef-3b69-4ca6-b845-b74afa499946",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>chip_id</th>\n",
+       "      <th>bank_id</th>\n",
+       "      <th>reads</th>\n",
+       "      <th>writes</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   chip_id  bank_id  reads  writes\n",
+       "0        0        0      4       0\n",
+       "1        0        1      4       0"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dram_df = pd.read_csv(Path(benchmarks[\"vectorAdd\"][0][\"simulate\"][\"stats_dir\"]) / \"stats.dram.csv\")\n",
+    "dram_total = dram_df[\"reads\"] + dram_df[\"writes\"]\n",
+    "dram_df[dram_total > 0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "dd0edc19-ca39-44d8-b01f-9a771b871d35",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>core_id</th>\n",
+       "      <th>chip_id</th>\n",
+       "      <th>bank_id</th>\n",
+       "      <th>reads</th>\n",
+       "      <th>writes</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   core_id  chip_id  bank_id  reads  writes\n",
+       "0        0        0        0      1       0\n",
+       "1        0        0        1      1       0"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dram_banks_df = pd.read_csv(Path(benchmarks[\"vectorAdd\"][0][\"simulate\"][\"stats_dir\"]) / \"stats.dram.banks.csv\")\n",
+    "dram_banks_total = dram_banks_df[\"reads\"] + dram_banks_df[\"writes\"]\n",
+    "dram_banks_df[dram_banks_total > 0]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 33,
@@ -523,6 +721,389 @@
     "accel_stats_df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "23fdd9a9-ecd4-4103-becb-291c924855e1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>value</th>\n",
+       "      <th>unit</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Device</th>\n",
+       "      <td>NVIDIA GeForce GTX 1080 (0)</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Context</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Stream</th>\n",
+       "      <td>7.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Kernel</th>\n",
+       "      <td>_Z6vecAddIfEvPT_S1_S1_i</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Correlation_ID</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>shared_load_transactions_per_request</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>shared_store_transactions_per_request</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>local_load_transactions_per_request</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>local_store_transactions_per_request</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gld_transactions_per_request</th>\n",
+       "      <td>13.25</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gst_transactions_per_request</th>\n",
+       "      <td>3.25</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>shared_store_transactions</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>shared_load_transactions</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>local_load_transactions</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>local_store_transactions</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gld_transactions</th>\n",
+       "      <td>106.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>gst_transactions</th>\n",
+       "      <td>13.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sysmem_read_transactions</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sysmem_write_transactions</th>\n",
+       "      <td>5.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_read_transactions</th>\n",
+       "      <td>660.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_write_transactions</th>\n",
+       "      <td>26.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>atomic_transactions</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>atomic_transactions_per_request</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_global_load_bytes</th>\n",
+       "      <td>832.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_local_load_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_surface_load_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_local_global_store_bytes</th>\n",
+       "      <td>416.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_global_reduction_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_global_atomic_store_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_surface_store_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_surface_reduction_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>l2_surface_atomic_store_bytes</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>global_load_requests</th>\n",
+       "      <td>26.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>local_load_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>surface_load_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>global_store_requests</th>\n",
+       "      <td>13.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>local_store_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>surface_store_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>global_atomic_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>global_reduction_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>surface_atomic_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>surface_reduction_requests</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dram_read_transactions</th>\n",
+       "      <td>72.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dram_write_transactions</th>\n",
+       "      <td>12.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dram_read_throughput</th>\n",
+       "      <td>1.151163</td>\n",
+       "      <td>GB/s</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dram_write_throughput</th>\n",
+       "      <td>196.46509</td>\n",
+       "      <td>MB/s</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dram_write_bytes</th>\n",
+       "      <td>384.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>dram_read_bytes</th>\n",
+       "      <td>2304.0</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                             value  unit\n",
+       "Device                                 NVIDIA GeForce GTX 1080 (0)  None\n",
+       "Context                                                        1.0   NaN\n",
+       "Stream                                                         7.0   NaN\n",
+       "Kernel                                     _Z6vecAddIfEvPT_S1_S1_i  None\n",
+       "Correlation_ID                                                 1.0   NaN\n",
+       "shared_load_transactions_per_request                           0.0   NaN\n",
+       "shared_store_transactions_per_request                          0.0   NaN\n",
+       "local_load_transactions_per_request                            0.0   NaN\n",
+       "local_store_transactions_per_request                           0.0   NaN\n",
+       "gld_transactions_per_request                                 13.25   NaN\n",
+       "gst_transactions_per_request                                  3.25   NaN\n",
+       "shared_store_transactions                                      0.0   NaN\n",
+       "shared_load_transactions                                       0.0   NaN\n",
+       "local_load_transactions                                        0.0   NaN\n",
+       "local_store_transactions                                       0.0   NaN\n",
+       "gld_transactions                                             106.0   NaN\n",
+       "gst_transactions                                              13.0   NaN\n",
+       "sysmem_read_transactions                                       0.0   NaN\n",
+       "sysmem_write_transactions                                      5.0   NaN\n",
+       "l2_read_transactions                                         660.0   NaN\n",
+       "l2_write_transactions                                         26.0   NaN\n",
+       "atomic_transactions                                            0.0   NaN\n",
+       "atomic_transactions_per_request                                0.0   NaN\n",
+       "l2_global_load_bytes                                         832.0   NaN\n",
+       "l2_local_load_bytes                                            0.0   NaN\n",
+       "l2_surface_load_bytes                                          0.0   NaN\n",
+       "l2_local_global_store_bytes                                  416.0   NaN\n",
+       "l2_global_reduction_bytes                                      0.0   NaN\n",
+       "l2_global_atomic_store_bytes                                   0.0   NaN\n",
+       "l2_surface_store_bytes                                         0.0   NaN\n",
+       "l2_surface_reduction_bytes                                     0.0   NaN\n",
+       "l2_surface_atomic_store_bytes                                  0.0   NaN\n",
+       "global_load_requests                                          26.0   NaN\n",
+       "local_load_requests                                            0.0   NaN\n",
+       "surface_load_requests                                          0.0   NaN\n",
+       "global_store_requests                                         13.0   NaN\n",
+       "local_store_requests                                           0.0   NaN\n",
+       "surface_store_requests                                         0.0   NaN\n",
+       "global_atomic_requests                                         0.0   NaN\n",
+       "global_reduction_requests                                      0.0   NaN\n",
+       "surface_atomic_requests                                        0.0   NaN\n",
+       "surface_reduction_requests                                     0.0   NaN\n",
+       "dram_read_transactions                                        72.0   NaN\n",
+       "dram_write_transactions                                       12.0   NaN\n",
+       "dram_read_throughput                                      1.151163  GB/s\n",
+       "dram_write_throughput                                    196.46509  MB/s\n",
+       "dram_write_bytes                                             384.0   NaN\n",
+       "dram_read_bytes                                             2304.0   NaN"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "native_df = pd.read_json(Path(benchmarks[\"vectorAdd\"][0][\"profile\"][\"profile_dir\"]) / \"profile.metrics.json\").T\n",
+    "native_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "e109b979-c2f4-4386-9989-b95bded18124",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'hw_cycle_df' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[40], line 7\u001b[0m\n\u001b[1;32m      4\u001b[0m native_commands_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame\u001b[38;5;241m.\u001b[39mfrom_dict([{k: v[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m e\u001b[38;5;241m.\u001b[39mitems()} \u001b[38;5;28;01mfor\u001b[39;00m e \u001b[38;5;129;01min\u001b[39;00m commands_json])\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;66;03m# , header=None, names=[\"kernel\", \"kernel_id\", \"stat\", \"value\"])\u001b[39;00m\n\u001b[0;32m----> 7\u001b[0m     native_commands_df \u001b[38;5;241m=\u001b[39m native_commands_df[\u001b[38;5;241m~\u001b[39m\u001b[43mhw_cycle_df\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCorrelation_ID\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39misnull()]\n\u001b[1;32m      8\u001b[0m     \u001b[38;5;66;03m# remove memcopies\u001b[39;00m\n\u001b[1;32m      9\u001b[0m     native_commands_df \u001b[38;5;241m=\u001b[39m native_commands_df[\u001b[38;5;241m~\u001b[39mhw_cycle_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mName\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m[CUDA memcpy .*\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'hw_cycle_df' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "with open(Path(benchmarks[\"vectorAdd\"][0][\"profile\"][\"profile_dir\"]) / \"profile.commands.json\", \"rb\") as f:\n",
+    "    commands_json = json.load(f)\n",
+    "# print(commands_json)\n",
+    "native_commands_df = pd.DataFrame.from_dict([{k: v[\"value\"] for k, v in e.items()} for e in commands_json])\n",
+    "if True:\n",
+    "    # , header=None, names=[\"kernel\", \"kernel_id\", \"stat\", \"value\"])\n",
+    "    native_commands_df = native_commands_df[~native_commands_df[\"Correlation_ID\"].isnull()]\n",
+    "    # remove memcopies\n",
+    "    native_commands_df = native_commands_df[~native_commands_df[\"Name\"].str.contains(r\"\\[CUDA memcpy .*\\]\")]\n",
+    "    # name refers to kernels now\n",
+    "    native_commands_df = native_commands_df.rename(columns={\"Name\": \"Kernel\"})\n",
+    "    # remove columns that are only relevant for memcopies\n",
+    "    # df = df.loc[:,df.notna().any(axis=0)]\n",
+    "    native_commands_df = native_commands_df.drop(columns=[\"Size\", \"Throughput\", \"SrcMemType\", \"DstMemType\"])\n",
+    "    # set the correct dtypes\n",
+    "    native_commands_df = native_commands_df.astype({\n",
+    "        \"Start\": \"float64\",\n",
+    "        \"Duration\": \"float64\",\n",
+    "        \"Static SMem\": \"float64\",\n",
+    "        \"Dynamic SMem\": \"float64\",\n",
+    "        \"Device\": \"string\",\n",
+    "        \"Kernel\": \"string\",\n",
+    "    })\n",
+    "native_commands_df"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 35,
@@ -800,7 +1381,7 @@
     " 'l2_cache_texture_read_mshr_hit',\n",
     " 'l2_cache_texture_read_reservation_fail',\n",
     " 'l2_cache_texture_read_sector_miss',\n",
-    "'num_dram_full_stalls',\n",
+    " 'num_dram_full_stalls',\n",
     " 'num_global_mem_read',\n",
     " 'num_global_mem_write',\n",
     " 'num_interconn_to_shared_mem_stalls',\n",
diff --git a/playground/sys/src/ref/memory_stats.hpp b/playground/sys/src/ref/memory_stats.hpp
index 3efb06cb..e82a0fb3 100644
--- a/playground/sys/src/ref/memory_stats.hpp
+++ b/playground/sys/src/ref/memory_stats.hpp
@@ -1,7 +1,5 @@
 #pragma once
 
-// #include <zlib.h>
-
 #include "memory_config.hpp"
 #include "shader_core_config.hpp"
 
diff --git a/profile/Cargo.toml b/profile/Cargo.toml
index 4e9a44e0..f5b0efbf 100644
--- a/profile/Cargo.toml
+++ b/profile/Cargo.toml
@@ -21,6 +21,7 @@ which = "4"
 csv = "1"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
+indexmap = { version = "2", features = ["serde"] }
 regex = "1"
 log = "0"
 tokio = { version = "1", features = ["full"] }
@@ -32,3 +33,4 @@ utils = { path = "../utils" }
 
 [dev-dependencies]
 pretty_assertions_sorted = "1"
+similar-asserts = "1"
diff --git a/profile/src/lib.rs b/profile/src/lib.rs
index 7f54d839..48a59b5b 100644
--- a/profile/src/lib.rs
+++ b/profile/src/lib.rs
@@ -1,9 +1,9 @@
 #![allow(clippy::missing_panics_doc, clippy::missing_errors_doc)]
-// #![allow(warnings)]
+pub mod nsight;
 pub mod nvprof;
 
 use serde::Deserialize;
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 #[derive(thiserror::Error, Debug)]
 pub enum ParseError {
@@ -94,8 +94,19 @@ where
     }
 }
 
-#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
-pub struct ProfilingResult<M> {
-    pub raw: String,
-    pub metrics: M,
+#[derive(PartialEq, Clone, Debug, serde::Serialize, serde::Deserialize)]
+pub enum Metrics {
+    /// Nvprof profiler metrics
+    Nvprof(nvprof::Output),
+    /// Nsight profiler metrics
+    Nsight(nsight::Output),
+}
+
+/// Profile test application using either the nvprof or nsight compute profiler.
+pub async fn nvprof<A>(executable: impl AsRef<Path>, args: A) -> Result<Metrics, Error>
+where
+    A: Clone + IntoIterator,
+    <A as IntoIterator>::Item: AsRef<std::ffi::OsStr>,
+{
+    unimplemented!()
 }
diff --git a/profile/src/main.rs b/profile/src/main.rs
index 8a093704..342aa6f5 100644
--- a/profile/src/main.rs
+++ b/profile/src/main.rs
@@ -1,6 +1,6 @@
 use color_eyre::eyre;
 
-use clap::{CommandFactory, Parser};
+use clap::{CommandFactory, Parser, Subcommand};
 use std::path::PathBuf;
 
 const HELP_TEMPLATE: &str = "{bin} {version} {author}
@@ -12,7 +12,36 @@ USAGE: {usage}
 {all-args}
 ";
 
-const USAGE: &str = "./profile [OPTIONS] -- <executable> [args]";
+const USAGE: &str = "./profile [nvprof|nsight|auto] [OPTIONS] -- <executable> [args]";
+
+/// Options for the nvprof profiler.
+#[derive(Parser, Debug, Clone)]
+pub struct NvprofOptions {
+    #[clap(long = "log-file", help = "output log file")]
+    pub log_file: Option<PathBuf>,
+}
+
+impl From<NvprofOptions> for profile::nvprof::Options {
+    fn from(_options: NvprofOptions) -> Self {
+        Self {}
+    }
+}
+
+/// Options for the nsight profiler.
+#[derive(Parser, Debug, Clone)]
+pub struct NsightOptions {
+    #[clap(long = "log-file", help = "output log file")]
+    pub log_file: Option<PathBuf>,
+}
+
+#[derive(Subcommand, Debug, Clone)]
+pub enum Command {
+    Auto,
+    /// Profile using `nvprof`
+    Nvprof(NvprofOptions),
+    /// Profile using `nsight-compute`
+    Nsight(NsightOptions),
+}
 
 #[derive(Parser, Debug, Clone)]
 #[clap(
@@ -23,10 +52,11 @@ const USAGE: &str = "./profile [OPTIONS] -- <executable> [args]";
     author = "romnn <contact@romnn.com>",
 )]
 pub struct Options {
-    #[clap(long = "log-file", help = "output log file")]
-    pub log_file: Option<PathBuf>,
     #[clap(long = "metrics-file", help = "output metrics file")]
     pub metrics_file: Option<PathBuf>,
+
+    #[clap(subcommand)]
+    pub command: Option<Command>,
 }
 
 fn parse_args() -> Result<(PathBuf, Vec<String>, Options), clap::Error> {
@@ -58,23 +88,29 @@ async fn main() -> eyre::Result<()> {
 
     let start = std::time::Instant::now();
 
-    let (exec, exec_args, _options) = match parse_args() {
+    let (exec, exec_args, options) = match parse_args() {
         Ok(parsed) => parsed,
         Err(err) => err.exit(),
     };
 
-    let options = profile::nvprof::Options {};
-    let profile::ProfilingResult { .. } = profile::nvprof::nvprof(exec, exec_args, &options)
-        .await
-        .map_err(|err| match err {
-            profile::Error::Command(err) => err.into_eyre(),
-            other => other.into(),
-        })?;
-
-    // todo: nice table view of the most important things
-    // todo: dump the raw output
-    // todo: dump the parsed output as json
-    // println!("{:#?}", &metrics);
+    let _output = match options.command {
+        None | Some(Command::Auto) => todo!(),
+        Some(Command::Nvprof(nvprof_options)) => {
+            let output = profile::nvprof::nvprof(exec, exec_args, &nvprof_options.into())
+                .await
+                .map_err(|err| match err {
+                    profile::Error::Command(err) => err.into_eyre(),
+                    other => other.into(),
+                })?;
+            profile::Metrics::Nvprof(output)
+        }
+        Some(Command::Nsight(_nsight_options)) => todo!(),
+    };
+
+    // TODO: nice table view of the most important things
+    // TODO: dump the raw output
+    // TODO: dump the parsed output as json
+    // println!("{:#?}", &output.metrics);
     println!("profiling done in {:?}", start.elapsed());
     Ok(())
 }
diff --git a/profile/src/nsight/metrics.rs b/profile/src/nsight/metrics.rs
new file mode 100644
index 00000000..211162e2
--- /dev/null
+++ b/profile/src/nsight/metrics.rs
@@ -0,0 +1,5 @@
+#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
+pub struct Metrics {
+    // #[serde(rename = "Device")]
+    // pub device: Metric<String>,
+}
diff --git a/profile/src/nsight/mod.rs b/profile/src/nsight/mod.rs
new file mode 100644
index 00000000..aafdb828
--- /dev/null
+++ b/profile/src/nsight/mod.rs
@@ -0,0 +1,10 @@
+mod metrics;
+
+pub use metrics::Metrics;
+
+#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
+pub struct Output {
+    // pub raw_metrics_log: String,
+    // pub raw_commands_log: String,
+    // pub metrics: Metrics,
+}
diff --git a/profile/src/nvprof/metrics.rs b/profile/src/nvprof/metrics.rs
index f7b1f33e..8f32aa4f 100644
--- a/profile/src/nvprof/metrics.rs
+++ b/profile/src/nvprof/metrics.rs
@@ -188,3 +188,47 @@ pub struct Metrics {
     pub dram_read_bytes: Metric<usize>,
     // TEMP
 }
+
+#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
+pub struct Command {
+    #[serde(rename = "Start")]
+    pub start: Metric<f32>,
+    #[serde(rename = "Duration")]
+    pub duration: Metric<f32>,
+    #[serde(rename = "Grid X", default)]
+    pub grid_x: Metric<usize>,
+    #[serde(rename = "Grid Y", default)]
+    pub grid_y: Metric<usize>,
+    #[serde(rename = "Grid Z", default)]
+    pub grid_z: Metric<usize>,
+    #[serde(rename = "Block X", default)]
+    pub block_x: Metric<usize>,
+    #[serde(rename = "Block Y", default)]
+    pub block_y: Metric<usize>,
+    #[serde(rename = "Block Z", default)]
+    pub block_z: Metric<usize>,
+    #[serde(rename = "Registers Per Thread")]
+    pub registers_per_thread: Metric<usize>,
+    #[serde(rename = "Static SMem")]
+    pub static_shared_memory: Metric<usize>,
+    #[serde(rename = "Dynamic SMem")]
+    pub dynamic_shared_memory: Metric<usize>,
+    #[serde(rename = "Size")]
+    pub size: Metric<usize>,
+    #[serde(rename = "Throughput")]
+    pub throughput: Metric<f32>,
+    #[serde(rename = "SrcMemType")]
+    pub src_mem_type: Metric<String>,
+    #[serde(rename = "DstMemType")]
+    pub dest_mem_type: Metric<String>,
+    #[serde(rename = "Device")]
+    pub device: Metric<String>,
+    #[serde(rename = "Context")]
+    pub context: Metric<usize>,
+    #[serde(rename = "Stream")]
+    pub stream: Metric<usize>,
+    #[serde(rename = "Name")]
+    pub name: Metric<String>,
+    #[serde(rename = "Correlation_ID")]
+    pub correlation_id: Metric<usize>,
+}
diff --git a/profile/src/nvprof/mod.rs b/profile/src/nvprof/mod.rs
index 8e5825ff..ba39abc0 100644
--- a/profile/src/nvprof/mod.rs
+++ b/profile/src/nvprof/mod.rs
@@ -1,16 +1,21 @@
 mod metrics;
 
-use async_process::Command;
 use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashMap;
-use std::io::{BufRead, Read, Seek};
+use std::io::{BufRead, Read};
 use std::path::Path;
 
 use crate::{Error, Metric, ParseError};
-pub use metrics::Metrics;
+pub use metrics::{Command, Metrics};
 
-pub type ProfilingResult = super::ProfilingResult<Metrics>;
+#[derive(PartialEq, Clone, Debug, Default, serde::Serialize, serde::Deserialize)]
+pub struct Output {
+    pub raw_metrics_log: String,
+    pub raw_commands_log: String,
+    pub metrics: Metrics,
+    pub commands: Vec<Command>,
+}
 
 macro_rules! optional {
     ($x:expr) => {
@@ -28,7 +33,10 @@ static NO_PERMISSION_REGEX: Lazy<Regex> =
 static PROFILE_RESULT_REGEX: Lazy<Regex> =
     Lazy::new(|| Regex::new(r"^==\d*==\s*Profiling result:\s*$").unwrap());
 
-pub fn parse_nvprof_csv(reader: &mut impl std::io::BufRead) -> Result<Metrics, ParseError> {
+pub fn seek_to_csv<R>(reader: &mut R) -> Result<csv::Reader<&mut R>, ParseError>
+where
+    R: std::io::BufRead,
+{
     // seek to valid start of csv data
     let mut lines = reader.by_ref().lines();
     for line in &mut lines {
@@ -45,70 +53,63 @@ pub fn parse_nvprof_csv(reader: &mut impl std::io::BufRead) -> Result<Metrics, P
         }
     }
 
-    // upgrade reader to a csv reader, keeping the current position
-    let mut csv_reader = csv::ReaderBuilder::new()
+    // upgrade reader to a csv reader and start reading from current position
+    let csv_reader = csv::ReaderBuilder::new()
         .flexible(false)
         .from_reader(reader);
+    Ok(csv_reader)
+}
 
+pub fn parse_nvprof_csv<M>(reader: &mut impl std::io::BufRead) -> Result<Vec<M>, ParseError>
+where
+    M: serde::de::DeserializeOwned,
+{
+    let mut csv_reader = seek_to_csv(reader)?;
     let mut records = csv_reader.deserialize();
 
-    let mut metrics: HashMap<String, Metric<String>> = HashMap::new();
-    let units: HashMap<String, String> = records.next().ok_or(ParseError::MissingUnits)??;
-    let values: HashMap<String, String> = records.next().ok_or(ParseError::MissingMetrics)??;
-    assert_eq!(units.len(), values.len());
+    use indexmap::IndexMap;
+    let mut entries = Vec::new();
+    let units: IndexMap<String, String> = records.next().ok_or(ParseError::MissingUnits)??;
 
-    for (metric, unit) in units {
-        metrics.entry(metric).or_default().unit = optional!(unit);
-    }
-    for (metric, value) in values {
-        metrics.entry(metric).or_default().value = optional!(value);
-    }
+    while let Some(values) = records.next().transpose()? {
+        assert_eq!(units.len(), values.len());
+        let metrics: HashMap<String, Metric<String>> = units
+            .iter()
+            .zip(values.iter())
+            .map(|((unit_metric, unit), (value_metric, value))| {
+                assert_eq!(unit_metric, value_metric);
+                (
+                    unit_metric.clone(),
+                    Metric {
+                        value: optional!(value).cloned(),
+                        unit: optional!(unit).cloned(),
+                    },
+                )
+            })
+            .collect();
 
-    // this is kind of hacky..
-    let metrics = serde_json::to_string(&metrics)?;
-    let metrics: Metrics = serde_json::from_str(&metrics)?;
-    Ok(metrics)
+        // this is kind of hacky..
+        let metrics = serde_json::to_string(&metrics)?;
+        let metrics: M = serde_json::from_str(&metrics)?;
+        entries.push(metrics);
+    }
+    Ok(entries)
 }
 
 #[derive(Debug, Clone)]
 pub struct Options {}
 
-/// Profile test application using nvbprof profiler.
-///
-/// Note: The nvbprof compiler is not recommended for newer devices.
-///
-/// # Errors
-/// - When creating temp dir fails.
-/// - When profiling fails.
-/// - When application fails.
-#[allow(clippy::too_many_lines)]
-pub async fn nvprof<A>(
+pub async fn profile_all_metrics<A>(
+    nvprof: impl AsRef<Path>,
     executable: impl AsRef<Path>,
     args: A,
-    _options: &Options,
-) -> Result<ProfilingResult, Error>
+    log_file_path: impl AsRef<Path>,
+) -> Result<(String, Metrics), Error>
 where
     A: IntoIterator,
     <A as IntoIterator>::Item: AsRef<std::ffi::OsStr>,
 {
-    let tmp_dir = tempfile::tempdir()?;
-    let log_file_path = tmp_dir.path().join("log_file.csv");
-
-    let nvprof = which::which("nvprof").map_err(|_| Error::MissingProfiler("nvprof".into()));
-    let nvprof = nvprof.or_else(|_| {
-        let cuda = utils::find_cuda().ok_or(Error::MissingCUDA)?;
-        Ok::<_, Error>(cuda.join("bin/nvprof"))
-    })?;
-    let nvprof = nvprof
-        .canonicalize()
-        .map_err(|_| Error::MissingProfiler(nvprof))?;
-
-    let executable = executable
-        .as_ref()
-        .canonicalize()
-        .map_err(|_| Error::MissingExecutable(executable.as_ref().into()))?;
-
-    let mut cmd = Command::new(nvprof);
+    let mut cmd = async_process::Command::new(nvprof.as_ref());
     cmd.args([
         "--unified-memory-profiling",
         "off",
@@ -126,8 +127,8 @@ where
         "--csv",
         "--log-file",
     ])
-    .arg(&log_file_path)
-    .arg(&executable)
+    .arg(log_file_path.as_ref())
+    .arg(executable.as_ref())
     .args(args.into_iter());
 
     let result = cmd.output().await?;
@@ -141,17 +142,114 @@ where
 
     let mut log_reader = std::io::BufReader::new(log_file);
 
-    let mut original_log = String::new();
-    log_reader.read_to_string(&mut original_log)?;
-    log_reader.rewind()?;
+    let mut raw_log = String::new();
+    log_reader.read_to_string(&mut raw_log)?;
+
+    let mut log_reader = std::io::Cursor::new(&raw_log);
+    match parse_nvprof_csv(&mut log_reader) {
+        Err(source) => Err(Error::Parse { raw_log, source }),
+        Ok(metrics) if metrics.len() != 1 => Err(Error::Parse {
+            raw_log,
+            source: ParseError::MissingMetrics,
+        }),
+        Ok(mut metrics) => Ok((raw_log, metrics.remove(0))),
+    }
+}
+
+pub async fn profile_commands<A>(
+    nvprof: impl AsRef<Path>,
+    executable: impl AsRef<Path>,
+    args: A,
+    log_file_path: impl AsRef<Path>,
+) -> Result<(String, Vec<Command>), Error>
+where
+    A: IntoIterator,
+    <A as IntoIterator>::Item: AsRef<std::ffi::OsStr>,
+{
+    let mut cmd = async_process::Command::new(nvprof.as_ref());
+    cmd.args([
+        "--unified-memory-profiling",
+        "off",
+        "--concurrent-kernels",
+        "off",
+        "--print-gpu-trace",
+        "-u",
+        "us",
+        "--demangling",
+        "off",
+        "--csv",
+        "--log-file",
+    ])
+    .arg(log_file_path.as_ref())
+    .arg(executable.as_ref())
+    .args(args.into_iter());
 
-    let metrics = parse_nvprof_csv(&mut log_reader).map_err(|source| Error::Parse {
-        raw_log: original_log.clone(),
-        source,
+    let result = cmd.output().await?;
+    if !result.status.success() {
+        return Err(Error::Command(utils::CommandError::new(&cmd, result)));
+    }
+
+    let log_file = std::fs::OpenOptions::new()
+        .read(true)
+        .open(&log_file_path)?;
+
+    let mut log_reader = std::io::BufReader::new(log_file);
+
+    let mut raw_log = String::new();
+    log_reader.read_to_string(&mut raw_log)?;
+
+    let mut log_reader = std::io::Cursor::new(&raw_log);
+    match parse_nvprof_csv(&mut log_reader) {
+        Err(source) => Err(Error::Parse { raw_log, source }),
+        Ok(commands) => Ok((raw_log, commands)),
+    }
+}
+
+/// Profile test application using nvprof profiler.
+///
+/// Note: `nvprof` is not compatible with newer devices.
+///
+/// # Errors
+/// - When creating temp dir fails.
+/// - When profiling fails.
+/// - When application fails.
+pub async fn nvprof<A>(
+    executable: impl AsRef<Path>,
+    args: A,
+    _options: &Options,
+) -> Result<Output, Error>
+where
+    A: Clone + IntoIterator,
+    <A as IntoIterator>::Item: AsRef<std::ffi::OsStr>,
+{
+    let tmp_dir = tempfile::tempdir()?;
+    let log_file_path = tmp_dir.path().join("log_file.csv");
+
+    let nvprof = which::which("nvprof").map_err(|_| Error::MissingProfiler("nvprof".into()));
+    let nvprof = nvprof.or_else(|_| {
+        let cuda = utils::find_cuda().ok_or(Error::MissingCUDA)?;
+        Ok::<_, Error>(cuda.join("bin/nvprof"))
     })?;
-    Ok(ProfilingResult {
-        raw: original_log,
+    let nvprof = nvprof
+        .canonicalize()
+        .map_err(|_| Error::MissingProfiler(nvprof))?;
+
+    let executable = executable
+        .as_ref()
+        .canonicalize()
+        .map_err(|_| Error::MissingExecutable(executable.as_ref().into()))?;
+
+    let (raw_metrics_log, metrics) =
+        profile_all_metrics(&nvprof, &executable, args.clone(), &log_file_path).await?;
+
+    let (raw_commands_log, commands) =
+        profile_commands(&nvprof, &executable, args, &log_file_path).await?;
+
+    Ok(Output {
+        raw_metrics_log,
+        raw_commands_log,
         metrics,
+        commands,
     })
 }
 
@@ -159,6 +257,7 @@ where
 mod tests {
     use super::{parse_nvprof_csv, Metric};
     use color_eyre::eyre;
+    use similar_asserts as diff;
     use std::io::Cursor;
 
     #[test]
@@ -167,24 +266,90 @@ mod tests {
         let log = String::from_utf8_lossy(bytes).to_string();
         dbg!(&log);
         let mut log_reader = Cursor::new(bytes);
-        let metrics = parse_nvprof_csv(&mut log_reader)?;
+        let mut metrics: Vec<super::Metrics> = parse_nvprof_csv(&mut log_reader)?;
+        diff::assert_eq!(metrics.len(), 1);
+        let metrics = metrics.remove(0);
         dbg!(&metrics);
-        assert_eq!(
+        diff::assert_eq!(
             metrics.device,
             Metric::new("NVIDIA GeForce GTX 1080 (0)".to_string(), None)
         );
-        assert_eq!(
+        diff::assert_eq!(
             metrics.kernel,
             Metric::new("_Z6vecAddIfEvPT_S1_S1_i".to_string(), None)
         );
-        assert_eq!(metrics.context, Metric::new(1, None));
-        assert_eq!(metrics.stream, Metric::new(7, None));
-        assert_eq!(metrics.dram_write_bytes, Metric::new(0, None));
-        assert_eq!(metrics.dram_read_bytes, Metric::new(7136, None));
-        assert_eq!(metrics.dram_read_transactions, Metric::new(223, None));
-        assert_eq!(metrics.dram_write_transactions, Metric::new(0, None));
-        assert_eq!(metrics.l2_read_transactions, Metric::new(66, None));
-        assert_eq!(metrics.l2_write_transactions, Metric::new(26, None));
+        diff::assert_eq!(metrics.context, Metric::new(1, None));
+        diff::assert_eq!(metrics.stream, Metric::new(7, None));
+        diff::assert_eq!(metrics.dram_write_bytes, Metric::new(0, None));
+        diff::assert_eq!(metrics.dram_read_bytes, Metric::new(7136, None));
+        diff::assert_eq!(metrics.dram_read_transactions, Metric::new(223, None));
+        diff::assert_eq!(metrics.dram_write_transactions, Metric::new(0, None));
+        diff::assert_eq!(metrics.l2_read_transactions, Metric::new(66, None));
+        diff::assert_eq!(metrics.l2_write_transactions, Metric::new(26, None));
+        Ok(())
+    }
+
+    #[test]
+    fn parse_commands() -> eyre::Result<()> {
+        use super::metrics::Command;
+        let bytes = include_bytes!("../../tests/nvprof_vectoradd_100_32_commands.txt");
+        let log = String::from_utf8_lossy(bytes).to_string();
+        dbg!(&log);
+        let mut log_reader = Cursor::new(bytes);
+        let metrics: Vec<Command> = parse_nvprof_csv(&mut log_reader)?;
+        dbg!(&metrics);
+        diff::assert_eq!(metrics.len(), 5);
+
+        diff::assert_eq!(
+            have: metrics[0],
+            want: Command {
+                start: Metric::new(245729.104000, "us".to_string()),
+                duration: Metric::new(1.088000, "us".to_string()),
+                grid_x: Metric::new(None, None),
+                grid_y: Metric::new(None, None),
+                grid_z: Metric::new(None, None),
+                block_x: Metric::new(None, None),
+                block_y: Metric::new(None, None),
+                block_z: Metric::new(None, None),
+                registers_per_thread: Metric::new(None, None),
+                static_shared_memory: Metric::new(None, "B".to_string()),
+                dynamic_shared_memory: Metric::new(None, "B".to_string()),
+                size: Metric::new(400, "B".to_string()),
+                throughput: Metric::new(350.615557, "MB/s".to_string()),
+                src_mem_type: Metric::new("Pageable".to_string(), None),
+                dest_mem_type: Metric::new("Device".to_string(), None),
+                device: Metric::new("NVIDIA GeForce GTX 1080 (0)".to_string(), None),
+                context: Metric::new(1, None),
+                stream: Metric::new(7, None),
+                name: Metric::new("[CUDA memcpy HtoD]".to_string(), None),
+                correlation_id: Metric::new(117, None),
+            },
+        );
+        diff::assert_eq!(
+            have: metrics[3],
+            want: Command {
+                start: Metric::new(245767.824000, "us".to_string()),
+                duration: Metric::new(3.264000, "us".to_string()),
+                grid_x: Metric::new(1, None),
+                grid_y: Metric::new(1, None),
+                grid_z: Metric::new(1, None),
+                block_x: Metric::new(1024, None),
+                block_y: Metric::new(1, None),
+                block_z: Metric::new(1, None),
+                registers_per_thread: Metric::new(8, None),
+                static_shared_memory: Metric::new(0, "B".to_string()),
+                dynamic_shared_memory: Metric::new(0, "B".to_string()),
+                size: Metric::new(None, "B".to_string()),
+                throughput: Metric::new(None, "MB/s".to_string()),
+                src_mem_type: Metric::new(None, None),
+                dest_mem_type: Metric::new(None, None),
+                device: Metric::new("NVIDIA GeForce GTX 1080 (0)".to_string(), None),
+                context: Metric::new(1, None),
+                stream: Metric::new(7, None),
+                name: Metric::new("_Z6vecAddIfEvPT_S1_S1_i".to_string(), None),
+                correlation_id: Metric::new(123, None),
+            },
+        );
         Ok(())
     }
 }
diff --git a/profile/tests/nvprof_vectoradd_100_32_commands.txt b/profile/tests/nvprof_vectoradd_100_32_commands.txt
new file mode 100755
index 00000000..e52e2616
--- /dev/null
+++ b/profile/tests/nvprof_vectoradd_100_32_commands.txt
@@ -0,0 +1,10 @@
+==2424234== NVPROF is profiling process 2424234, command: /home/roman/dev/box/test-apps/vectoradd/vectoradd 100 32
+==2424234== Profiling application: /home/roman/dev/box/test-apps/vectoradd/vectoradd 100 32
+==2424234== Profiling result:
+"Start","Duration","Grid X","Grid Y","Grid Z","Block X","Block Y","Block Z","Registers Per Thread","Static SMem","Dynamic SMem","Size","Throughput","SrcMemType","DstMemType","Device","Context","Stream","Name","Correlation_ID"
+us,us,,,,,,,,B,B,B,MB/s,,,,,,,
+245729.104000,1.088000,,,,,,,,,,400,350.615557,"Pageable","Device","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy HtoD]",117
+245736.176000,0.672000,,,,,,,,,,400,567.663283,"Pageable","Device","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy HtoD]",119
+245742.384000,0.672000,,,,,,,,,,400,567.663283,"Pageable","Device","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy HtoD]",121
+245767.824000,3.264000,1,1,1,1024,1,1,8,0,0,,,,,"NVIDIA GeForce GTX 1080 (0)","1","7","_Z6vecAddIfEvPT_S1_S1_i",123
+245780.080000,1.152000,,,,,,,,,,400,331.136915,"Device","Pageable","NVIDIA GeForce GTX 1080 (0)","1","7","[CUDA memcpy DtoH]",125
diff --git a/stats/Cargo.toml b/stats/Cargo.toml
index e7722255..7a46b712 100644
--- a/stats/Cargo.toml
+++ b/stats/Cargo.toml
@@ -14,5 +14,6 @@ denylist = ["default"]
 indexmap = { version = "2", features = ["serde"] }
 serde = { version = "1", features = ["derive"] }
 strum = { version = "0", features = ["derive"] }
+utils = { path = "../utils" }
 
 [dev-dependencies]
diff --git a/stats/src/cache.rs b/stats/src/cache.rs
index bd8cae46..fa9fd6da 100644
--- a/stats/src/cache.rs
+++ b/stats/src/cache.rs
@@ -218,17 +218,14 @@ pub type PerCacheCsvRow = (usize, CsvRow);
 #[allow(clippy::module_name_repetitions)]
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PerCache(pub Box<[Cache]>);
-// pub struct PerCache(pub indexmap::IndexMap<usize, Cache>);
-// pub struct PerCache(pub HashMap<usize, Cache>);
 
 impl PerCache {
     #[must_use]
     pub fn new(size: usize) -> Self {
-        Self(vec![Cache::default(); size].into_boxed_slice())
+        Self(utils::box_slice![Cache::default(); size])
     }
 
     #[must_use]
-    // pub fn into_inner(self) -> indexmap::IndexMap<usize, Cache> {
     pub fn into_inner(self) -> Box<[Cache]> {
         self.0
     }
diff --git a/stats/src/dram.rs b/stats/src/dram.rs
index 424f9481..e9c33d97 100644
--- a/stats/src/dram.rs
+++ b/stats/src/dram.rs
@@ -1,66 +1,124 @@
 use serde::{Deserialize, Serialize};
+use utils::box_slice;
 
-// use indexmap::IndexMap;
-// #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
-// pub struct JSONDRAM {
-//     /// bank writes [shader id][dram chip id][bank id]
-//     pub bank_writes: IndexMap<usize, IndexMap<usize, IndexMap<usize, u64>>>,
-//     /// bank reads [shader id][dram chip id][bank id]
-//     pub bank_reads: IndexMap<usize, IndexMap<usize, IndexMap<usize, u64>>>,
-//     /// bank writes [dram chip id][bank id]
-//     pub total_bank_writes: IndexMap<usize, IndexMap<usize, u64>>,
-//     /// bank reads [dram chip id][bank id]
-//     pub total_bank_reads: IndexMap<usize, IndexMap<usize, u64>>,
-// }
+#[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub struct BankAccessesCsvRow {
+    /// Core ID
+    core_id: usize,
+    /// DRAM chip ID
+    chip_id: usize,
+    /// Bank ID
+    bank_id: usize,
+    /// Number of reads
+    reads: u64,
+    /// Number of writes
+    writes: u64,
+}
 
 #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
-pub struct PerCoreDRAM {
-    /// bank writes [shader id][dram chip id][bank id]
-    pub bank_writes: Vec<Vec<Vec<u64>>>,
-    /// bank reads [shader id][dram chip id][bank id]
-    pub bank_reads: Vec<Vec<Vec<u64>>>,
+pub struct AccessesCsvRow {
+    /// DRAM chip ID
+    chip_id: usize,
+    /// Bank ID
+    bank_id: usize,
+    /// Number of reads
+    reads: u64,
+    /// Number of writes
+    writes: u64,
 }
 
 #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
 pub struct DRAM {
-    /// bank writes [shader id][dram chip id][bank id]
-    pub bank_writes: Vec<Vec<Vec<u64>>>,
-    /// bank reads [shader id][dram chip id][bank id]
-    pub bank_reads: Vec<Vec<Vec<u64>>>,
-    /// bank writes [dram chip id][bank id]
-    pub total_bank_writes: Vec<Vec<u64>>,
-    /// bank reads [dram chip id][bank id]
-    pub total_bank_reads: Vec<Vec<u64>>,
+    /// Number of bank writes [shader id][dram chip id][bank id]
+    pub bank_writes: Box<[Box<[Box<[u64]>]>]>,
+    /// Number of bank reads [shader id][dram chip id][bank id]
+    pub bank_reads: Box<[Box<[Box<[u64]>]>]>,
+    /// Number of bank writes [dram chip id][bank id]
+    pub total_bank_writes: Box<[Box<[u64]>]>,
+    /// Number of bank reads [dram chip id][bank id]
+    pub total_bank_reads: Box<[Box<[u64]>]>,
+
+    /// Number of cores
+    pub num_cores: usize,
+    /// Number of DRAM chips
+    pub num_chips: usize,
+    /// Number of banks
+    pub num_banks: usize,
 }
 
 impl DRAM {
     #[must_use]
     pub fn new(num_total_cores: usize, num_mem_units: usize, num_banks: usize) -> Self {
-        let total_bank_writes = vec![vec![0; num_banks]; num_mem_units];
+        let total_bank_writes = box_slice![box_slice![0; num_banks]; num_mem_units];
         let total_bank_reads = total_bank_writes.clone();
-        let bank_reads = vec![total_bank_reads.clone(); num_total_cores];
+        let bank_reads = box_slice![total_bank_reads.clone(); num_total_cores];
         let bank_writes = bank_reads.clone();
         Self {
             bank_writes,
             bank_reads,
             total_bank_writes,
             total_bank_reads,
+            num_banks,
+            num_cores: num_total_cores,
+            num_chips: num_mem_units,
+        }
+    }
+
+    /// One CSV row per (core, chip, bank); dimensions come from the counter arrays.
+    #[must_use]
+    pub fn bank_accesses_csv(&self) -> Vec<BankAccessesCsvRow> {
+        let cores = self.bank_reads.iter().zip(self.bank_writes.iter());
+        let mut out = Vec::new();
+        for (core_id, (core_reads, core_writes)) in cores.enumerate() {
+            let chips = core_reads.iter().zip(core_writes.iter());
+            for (chip_id, (chip_reads, chip_writes)) in chips.enumerate() {
+                let banks = chip_reads.iter().zip(chip_writes.iter()).enumerate();
+                out.extend(banks.map(|(bank_id, (&reads, &writes))| BankAccessesCsvRow {
+                    core_id,
+                    chip_id,
+                    bank_id,
+                    reads,
+                    writes,
+                }));
+            }
         }
+        out
     }
 
     #[must_use]
-    pub fn flatten(self) -> Self {
-        todo!("flatten dram stats");
+    pub fn accesses_csv(&self) -> Vec<AccessesCsvRow> {
+        let mut out = Vec::new();
+        for chip_id in 0..self.num_chips {
+            for bank_id in 0..self.num_banks {
+                let reads = self.total_bank_reads[chip_id][bank_id];
+                let writes = self.total_bank_writes[chip_id][bank_id];
+                out.push(AccessesCsvRow {
+                    chip_id,
+                    bank_id,
+                    reads,
+                    writes,
+                });
+            }
+        }
+        out
     }
 
     #[must_use]
     pub fn total_reads(&self) -> u64 {
-        self.total_bank_reads.iter().flatten().sum()
+        self.total_bank_reads
+            .iter()
+            .map(AsRef::as_ref)
+            .flatten()
+            .sum()
     }
 
     #[must_use]
     pub fn total_writes(&self) -> u64 {
-        self.total_bank_writes.iter().flatten().sum()
+        self.total_bank_writes
+            .iter()
+            .map(AsRef::as_ref)
+            .flatten()
+            .sum()
     }
 
     // #[must_use]
diff --git a/test-apps/test-apps-materialized.yml b/test-apps/test-apps-materialized.yml
index cdbd6c5c..799c2d56 100755
--- a/test-apps/test-apps-materialized.yml
+++ b/test-apps/test-apps-materialized.yml
@@ -2,7 +2,7 @@
 ##
 ## AUTO GENERATED! DO NOT EDIT
 ##
-## this configuration was materialized from /home/roman/dev/box/test-apps/test-apps.yml on 23/08/2023 14:44:48
+## this configuration was materialized from /home/roman/dev/box/test-apps/test-apps.yml on 01/09/2023 01:38:57
 ##
 
 config:
diff --git a/utils/src/lib.rs b/utils/src/lib.rs
index 1aba5a85..79acd029 100644
--- a/utils/src/lib.rs
+++ b/utils/src/lib.rs
@@ -91,6 +91,26 @@ macro_rules! decode_utf8 {
     };
 }
 
+/// Creates a boxed slice (`Box<[T]>`) using the same syntax as `vec!`.
+///
+/// - `box_slice![]` creates an empty boxed slice.
+/// - `box_slice![elem; n]` creates `n` clones of `elem` (requires `Clone`).
+/// - `box_slice![a, b, c]` creates a boxed slice from the listed elements.
+///
+/// Every form builds a `Vec` first and converts it with `into_boxed_slice`.
+#[macro_export]
+macro_rules! box_slice {
+    () => (
+        ::std::vec::Vec::new().into_boxed_slice()
+    );
+    ($elem:expr; $n:expr) => (
+        ::std::vec![$elem; $n].into_boxed_slice()
+    );
+    ($($x:expr),+ $(,)?) => (
+        ::std::vec![$($x),+].into_boxed_slice()
+    );
+}
+
 impl CommandError {
     pub fn into_eyre(self) -> eyre::Report {
         let command_section = self.command.clone().header("command:");
diff --git a/validate/src/accelsim.rs b/validate/src/accelsim.rs
index a29520da..8613f5ca 100644
--- a/validate/src/accelsim.rs
+++ b/validate/src/accelsim.rs
@@ -177,10 +177,12 @@ pub async fn simulate(
     // let flat_stats: Vec<_> = stats.into_inner().into_iter().collect();
     // serde_json::to_writer_pretty(open_writable(&stats_out_file)?, &flat_stats)?;
 
-    serde_json::to_writer_pretty(
-        open_writable(stats_dir.join("exec_time.json"))?,
-        &dur.as_millis(),
-    )
-    .map_err(eyre::Report::from)?;
+    #[cfg(debug_assertions)]
+    let exec_time_file_path = stats_dir.join("exec_time.debug.json");
+    #[cfg(not(debug_assertions))]
+    let exec_time_file_path = stats_dir.join("exec_time.release.json");
+
+    serde_json::to_writer_pretty(open_writable(exec_time_file_path)?, &dur.as_millis())
+        .map_err(eyre::Report::from)?;
     Ok(())
 }
diff --git a/validate/src/benchmark/matrix.rs b/validate/src/benchmark/matrix.rs
index 72c2be36..464b18fc 100644
--- a/validate/src/benchmark/matrix.rs
+++ b/validate/src/benchmark/matrix.rs
@@ -181,9 +181,6 @@ pub fn expand(inputs: &Inputs, includes: &Includes, excludes: &Excludes) -> Vec<
             let intersecting_entries: Vec<_> =
                 current_entries.intersection(&include_entries).collect();
 
-            dbg!(&intersecting_keys);
-            dbg!(&intersecting_entries);
-
             assert!(!current.is_empty());
             if intersecting_keys.is_empty() {
                 // does not overwrite anything: extend combination
diff --git a/validate/src/options.rs b/validate/src/options.rs
index 14d3d123..3a2f6513 100644
--- a/validate/src/options.rs
+++ b/validate/src/options.rs
@@ -52,7 +52,12 @@ pub struct Options {
     #[clap(short = 'b', long = "bench", help = "name of benchmark to run")]
     pub selected_benchmarks: Vec<String>,
 
-    #[clap(long = "force", help = "force re-run", default_value = "false")]
+    #[clap(
+        short = 'f',
+        long = "force",
+        help = "force re-run",
+        default_value = "false"
+    )]
     pub force: bool,
 
     #[clap(long = "fail-fast", help = "fail fast", default_value = "false")]
diff --git a/validate/src/playground.rs b/validate/src/playground.rs
index 3874fb07..9dbb7b0f 100644
--- a/validate/src/playground.rs
+++ b/validate/src/playground.rs
@@ -92,11 +92,16 @@ pub async fn simulate(
 
     create_dirs(&stats_dir).map_err(eyre::Report::from)?;
     let _stats_out_file = stats_dir.join("stats.json");
-    let exec_dur_file = stats_dir.join("exec_time.json");
 
     // let flat_stats: Vec<_> = stats.into_iter().collect();
     // serde_json::to_writer_pretty(open_writable(&stats_out_file)?, &flat_stats)?;
-    serde_json::to_writer_pretty(open_writable(exec_dur_file)?, &dur.as_millis())
+
+    #[cfg(debug_assertions)]
+    let exec_time_file_path = stats_dir.join("exec_time.debug.json");
+    #[cfg(not(debug_assertions))]
+    let exec_time_file_path = stats_dir.join("exec_time.release.json");
+
+    serde_json::to_writer_pretty(open_writable(exec_time_file_path)?, &dur.as_millis())
         .map_err(eyre::Report::from)?;
     Ok(())
 }
diff --git a/validate/src/profile.rs b/validate/src/profile.rs
index ebd43149..3b3888f9 100644
--- a/validate/src/profile.rs
+++ b/validate/src/profile.rs
@@ -6,6 +6,7 @@ use crate::{
 };
 use color_eyre::eyre;
 use std::io::Write;
+use std::path::Path;
 use utils::fs::create_dirs;
 
 pub async fn profile(
@@ -14,28 +15,45 @@ pub async fn profile(
     _trace_opts: &options::Profile,
 ) -> Result<(), RunError> {
     let profile_dir = &bench.profile.profile_dir;
-    // dbg!(&profile_dir);
     create_dirs(profile_dir).map_err(eyre::Report::from)?;
 
-    let log_file = profile_dir.join("profile.log");
-    let metrics_file = profile_dir.join("profile.metrics.csv");
+    let metrics_log_file = profile_dir.join("profile.nvprof.metrics.log");
+    let commands_log_file = profile_dir.join("profile.nvprof.commands.log");
+    let metrics_file_json = profile_dir.join("profile.metrics.json");
+    let commands_file_json = profile_dir.join("profile.commands.json");
 
-    if !options.force && log_file.is_file() && metrics_file.is_file() {
+    if !options.force
+        && [
+            metrics_log_file.as_path(),
+            commands_log_file.as_path(),
+            metrics_file_json.as_path(),
+            commands_file_json.as_path(),
+        ]
+        .into_iter()
+        .all(Path::is_file)
+    {
         return Err(RunError::Skipped);
     }
 
     let options = profile::nvprof::Options {};
-    let results = profile::nvprof::nvprof(&bench.executable, &bench.args, &options)
+    let output = profile::nvprof::nvprof(&bench.executable, &bench.args, &options)
         .await
         .map_err(|err| match err {
             profile::Error::Command(err) => err.into_eyre(),
             err => err.into(),
         })?;
 
-    serde_json::to_writer_pretty(open_writable(&metrics_file)?, &results.metrics)
+    open_writable(&metrics_log_file)?
+        .write_all(output.raw_metrics_log.as_bytes())
         .map_err(eyre::Report::from)?;
-    open_writable(&log_file)?
-        .write_all(results.raw.as_bytes())
+    open_writable(&commands_log_file)?
+        .write_all(output.raw_commands_log.as_bytes())
         .map_err(eyre::Report::from)?;
+
+    serde_json::to_writer_pretty(open_writable(&metrics_file_json)?, &output.metrics)
+        .map_err(eyre::Report::from)?;
+    serde_json::to_writer_pretty(open_writable(&commands_file_json)?, &output.commands)
+        .map_err(eyre::Report::from)?;
+
     Ok(())
 }
diff --git a/validate/src/simulate.rs b/validate/src/simulate.rs
index a4e109e9..640bd286 100644
--- a/validate/src/simulate.rs
+++ b/validate/src/simulate.rs
@@ -94,11 +94,13 @@ pub async fn simulate(
 
     crate::stats::write_stats_as_csv(&stats_dir, stats)?;
 
-    serde_json::to_writer_pretty(
-        open_writable(stats_dir.join("exec_time.json"))?,
-        &dur.as_millis(),
-    )
-    .map_err(eyre::Report::from)?;
+    #[cfg(debug_assertions)]
+    let exec_time_file_path = stats_dir.join("exec_time.debug.json");
+    #[cfg(not(debug_assertions))]
+    let exec_time_file_path = stats_dir.join("exec_time.release.json");
+
+    serde_json::to_writer_pretty(open_writable(exec_time_file_path)?, &dur.as_millis())
+        .map_err(eyre::Report::from)?;
 
     // let json_stats: stats::FlatStats = stats.clone().into();
     // let json_stats_out_file = stats_dir.join("stats.json");
diff --git a/validate/src/stats.rs b/validate/src/stats.rs
index d9fb5839..f617c0f3 100644
--- a/validate/src/stats.rs
+++ b/validate/src/stats.rs
@@ -62,23 +62,32 @@ pub fn write_csv_rows(
 
 pub fn write_stats_as_csv(stats_dir: impl AsRef<Path>, stats: stats::Stats) -> eyre::Result<()> {
     let stats_dir = stats_dir.as_ref();
+    // sim stats
     write_csv_rows(open_writable(sim_stats_path(stats_dir))?, &[stats.sim])?;
-    // validate::write_csv_rows(
-    //     open_writable(stats_dir.join("stats.dram.csv"))?,
-    //     &[stats::dram::PerCoreDRAM {
-    //         bank_writes: stats.dram.bank_writes,
-    //         bank_reads: stats.dram.bank_reads,
-    //     }],
-    // )?;
+
+    // dram stats
+    write_csv_rows(
+        open_writable(stats_dir.join("stats.dram.csv"))?,
+        &stats.dram.accesses_csv(),
+    )?;
+    write_csv_rows(
+        open_writable(stats_dir.join("stats.dram.banks.csv"))?,
+        &stats.dram.bank_accesses_csv(),
+    )?;
+
+    // access stats
     write_csv_rows(
         open_writable(access_stats_path(stats_dir))?,
         &stats.accesses.flatten(),
     )?;
+
+    // instruction stats
     write_csv_rows(
         open_writable(instruction_stats_path(stats_dir))?,
         &stats.instructions.flatten(),
     )?;
 
+    // cache stats
     for (cache, rows) in [
         (Cache::L1I, stats.l1i_stats.flatten()),
         (Cache::L1D, stats.l1d_stats.flatten()),