From e774b294a8c05aefb7a9db71e9d85d06ae9ff602 Mon Sep 17 00:00:00 2001 From: Niklas Koehnecke Date: Thu, 28 Mar 2024 15:52:36 +0100 Subject: [PATCH] wip --- src/examples/test.ipynb | 299 ++++++++++++++++++ .../evaluation/dataset/domain.py | 7 + .../evaluation/evaluation/domain.py | 8 + .../infrastructure/repository_navigator.py | 18 ++ .../evaluation/run/domain.py | 8 + 5 files changed, 340 insertions(+) create mode 100644 src/examples/test.ipynb diff --git a/src/examples/test.ipynb b/src/examples/test.ipynb new file mode 100644 index 000000000..fd6c7d650 --- /dev/null +++ b/src/examples/test.ipynb @@ -0,0 +1,299 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from intelligence_layer.evaluation.dataset.domain import Example\n", + "from intelligence_layer.evaluation.run.domain import ExampleOutput\n", + "from intelligence_layer.evaluation.infrastructure.repository_navigator import RunLineage, EvaluationLineage, RepositoryNavigator" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'tuple' object has no attribute '_rich_render'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/intelligence-layer-d3iSWYpm-py3.10/lib/python3.10/site-packages/IPython/core/formatters.py:925\u001b[0m, in \u001b[0;36mIPythonDisplayFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 923\u001b[0m method \u001b[38;5;241m=\u001b[39m get_real_method(obj, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_method)\n\u001b[1;32m 924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m method \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 925\u001b[0m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 926\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/aleph/intelligence-layer/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py:43\u001b[0m, in \u001b[0;36mEvaluationLineage._ipython_display_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 41\u001b[0m output_tree \u001b[38;5;241m=\u001b[39m Tree(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOutputs:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m output \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutputs:\n\u001b[0;32m---> 43\u001b[0m output_tree\u001b[38;5;241m.\u001b[39madd(\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_rich_render\u001b[49m(skip_example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n\u001b[1;32m 44\u001b[0m tree\u001b[38;5;241m.\u001b[39madd(output_tree)\n\u001b[1;32m 45\u001b[0m tree\u001b[38;5;241m.\u001b[39madd(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation\u001b[38;5;241m.\u001b[39m_rich_render(skip_example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n", + "\u001b[0;31mAttributeError\u001b[0m: 'tuple' object has no attribute '_rich_render'" + ] + 
}, + { + "data": { + "text/plain": [ + "EvaluationLineage(example=Example ID = example-id\n", + "Input = input0\n", + "Expected output = \"expected_output0\"\n", + ", outputs=Example ID=example-id\n", + "Related Run ID=run-id\n", + "Output=\"output\"\n", + ", evaluation=Evaluation ID = eval-id\n", + "Example ID = example-id\n", + "Result = eval='Result'\n", + ")" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pydantic import BaseModel\n", + "from intelligence_layer.evaluation.evaluation.domain import ExampleEvaluation\n", + "\n", + "outputs = [ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\"), ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\")]\n", + "lineage = EvaluationLineage(example=Example(input=\"input0\", expected_output=\"expected_output0\", data=\"data0\", id=\"example-id\"), \n", + " outputs=ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\"),\n", + " evaluation=ExampleEvaluation(evaluation_id=\"eval-id\", example_id=\"example-id\", result=DummyEval(eval=\"Result\")))\n", + "lineage" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for RunLineage\noutput\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m lineage \u001b[38;5;241m=\u001b[39m \u001b[43mRunLineage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexample\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExample\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minput0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexpected_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpected_output0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mExampleOutput\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrun-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mexample_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moutput\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m lineage\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/intelligence-layer-d3iSWYpm-py3.10/lib/python3.10/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for RunLineage\noutput\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type" + ] + } + ], + "source": [ + "lineage = RunLineage(example=Example(input=\"input0\", expected_output=\"expected_output0\", data=\"data0\", id=\"example-id\"), \n", + " output=[ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\")])\n", + "lineage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for EvaluationLineage\noutputs\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28meval\u001b[39m: \u001b[38;5;28mstr\u001b[39m\n\u001b[1;32m 6\u001b[0m outputs \u001b[38;5;241m=\u001b[39m [ExampleOutput(run_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexample-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, output\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m), ExampleOutput(run_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexample-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, output\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m----> 7\u001b[0m lineage \u001b[38;5;241m=\u001b[39m \u001b[43mEvaluationLineage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexample\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExample\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minput0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexpected_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpected_output0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mExampleOutput\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrun-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moutput\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mevaluation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExampleEvaluation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluation_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meval-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mDummyEval\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43meval\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mResult\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m lineage\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/intelligence-layer-d3iSWYpm-py3.10/lib/python3.10/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from 
tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for EvaluationLineage\noutputs\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exampleoutput
0Example ID = example-id\\nInput = input0\\nExpec...Example ID=example-id\\nRelated Run ID=run-id\\n...
\n", + "
" + ], + "text/plain": [ + " example output\n", + "0 Example ID = example-id\\nInput = input0\\nExpec... Example ID=example-id\\nRelated Run ID=run-id\\n..." + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame([lineage.__dict__])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Repository does not contain a run with id: non-existent", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mintelligence_layer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mevaluation\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m InMemoryRunRepository\n\u001b[1;32m 2\u001b[0m repo \u001b[38;5;241m=\u001b[39m InMemoryRunRepository()\n\u001b[0;32m----> 3\u001b[0m \u001b[43mrepo\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_outputs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnon-existent\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/aleph/intelligence-layer/src/intelligence_layer/evaluation/run/in_memory_run_repository.py:76\u001b[0m, in \u001b[0;36mInMemoryRunRepository.example_outputs\u001b[0;34m(self, run_id, output_type)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexample_outputs\u001b[39m(\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28mself\u001b[39m, run_id: \u001b[38;5;28mstr\u001b[39m, output_type: \u001b[38;5;28mtype\u001b[39m[Output]\n\u001b[1;32m 74\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterable[ExampleOutput[Output]]:\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m run_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_run_overviews\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m---> 76\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository does not contain a run with id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrun_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[1;32m 79\u001b[0m cast(ExampleOutput[Output], example_output)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m example_output \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 83\u001b[0m )\n\u001b[1;32m 84\u001b[0m )\n", + "\u001b[0;31mValueError\u001b[0m: Repository does not contain a run with id: non-existent" + ] + } + ], + "source": [ + "from intelligence_layer.evaluation import InMemoryRunRepository\n", + "repo = InMemoryRunRepository()\n", + "repo.example_outputs(\"non-existent\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# BODY_SIZE_LIMIT=\"Infinity\"" + ] + }, + { + "cell_type": "code", + 
"execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "from intelligence_layer.core.tracer.file_tracer import FileTracer\n", + "from intelligence_layer.core.tracer.in_memory_tracer import InMemoryTracer\n", + "tracer = FileTracer(Path(\"temp.jsonl\"))\n", + "#tracer = InMemoryTracer()\n", + "data = [\"this is my data\"* 10] * 10 \n", + "for i in range(100):\n", + " with tracer.span(\"test\") as t:\n", + " t.log(\"message\", data)\n", + "\n", + "tracer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "intelligence-layer-d3iSWYpm-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/intelligence_layer/evaluation/dataset/domain.py b/src/intelligence_layer/evaluation/dataset/domain.py index a61ff0ce0..e91fafd37 100644 --- a/src/intelligence_layer/evaluation/dataset/domain.py +++ b/src/intelligence_layer/evaluation/dataset/domain.py @@ -2,6 +2,7 @@ from uuid import uuid4 from pydantic import BaseModel, Field +from rich.tree import Tree from intelligence_layer.core.task import Input from intelligence_layer.core.tracer.tracer import PydanticSerializable @@ -40,6 +41,12 @@ def __str__(self) -> str: f'Expected output = "{self.expected_output}"\n' ) + def _rich_render(self): + example_tree = Tree(f"Example: {self.id}") + example_tree.add(str(self.input)) + example_tree.add(str(self.expected_output)) + return example_tree + class Dataset(BaseModel): """Represents a dataset linked to multiple examples diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py index 6883be626..da0700fb4 100644 --- a/src/intelligence_layer/evaluation/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/evaluation/domain.py @@ -2,6 +2,7 @@ from typing import Generic, Optional, TypeVar from pydantic import BaseModel, SerializeAsAny +from rich.tree import Tree from intelligence_layer.evaluation.run.domain import RunOverview @@ -55,6 +56,13 @@ def __str__(self) -> str: f"Result = {self.result}\n" ) + def _rich_render(self, skip_example_id: bool = False): + tree = Tree(f"Evaluation: {self.evaluation_id}") + if not skip_example_id: + tree.add(str(self.example_id)) + tree.add(str(self.result)) + return tree + class EvaluationOverview(BaseModel, frozen=True): """Overview of the un-aggregated results of evaluating a :class:`Task` on a dataset. 
diff --git a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
index ca3cf7ba5..944d660ea 100644
--- a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
+++ b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
@@ -1,7 +1,9 @@
 import itertools
 from typing import Generic, Iterable, Sequence
 
+import rich
 from pydantic import BaseModel
+from rich.tree import Tree
 
 from intelligence_layer.core.task import Input, Output
 from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
@@ -21,12 +23,28 @@ class RunLineage(BaseModel, Generic[Input, ExpectedOutput, Output]):
     example: Example[Input, ExpectedOutput]
     output: ExampleOutput[Output]
 
+    def _ipython_display_(self):
+        tree = Tree("Run Lineage")
+        tree.add(self.example._rich_render())
+        tree.add(self.output._rich_render(skip_example_id=True))
+        rich.print(tree)
+
 
 class EvaluationLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]):
     example: Example[Input, ExpectedOutput]
     outputs: Sequence[ExampleOutput[Output]]
     evaluation: ExampleEvaluation[Evaluation]
 
+    def _ipython_display_(self):
+        tree = Tree("Evaluation Lineage")
+        tree.add(self.example._rich_render())
+        output_tree = Tree("Outputs:")
+        for output in self.outputs:
+            output_tree.add(output._rich_render(skip_example_id=True))
+        tree.add(output_tree)
+        tree.add(self.evaluation._rich_render(skip_example_id=True))
+        rich.print(tree)
+
 
 class RepositoryNavigator:
     """The `RepositoryNavigator` is used to retrieve coupled data from multiple repositories."""
diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py
index 20a6ef296..cbd1e2dce 100644
--- a/src/intelligence_layer/evaluation/run/domain.py
+++ b/src/intelligence_layer/evaluation/run/domain.py
@@ -2,6 +2,7 @@
 from typing import Generic
 
 from pydantic import BaseModel
+from rich.tree import Tree
 
 from intelligence_layer.core.task import Output
 
@@ -49,6 +50,13 @@ def __str__(self) -> str:
             f'Output="{self.output}"\n'
         )
 
+    def _rich_render(self, skip_example_id: bool = False):
+        tree = Tree(f"Output: {self.run_id}")
+        if not skip_example_id:
+            tree.add(str(self.example_id))
+        tree.add(str(self.output))
+        return tree
+
 
 class SuccessfulExampleOutput(BaseModel, Generic[Output]):
     """Successful output of a single evaluated :class:`Example`
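
Usage sketch (illustrative, not part of the diff above): one way the new `_ipython_display_` hooks could be exercised, reusing the constructors that already appear in the notebook cells; the IDs and payloads are made-up placeholders.

    from intelligence_layer.evaluation.dataset.domain import Example
    from intelligence_layer.evaluation.infrastructure.repository_navigator import RunLineage
    from intelligence_layer.evaluation.run.domain import ExampleOutput

    # Hypothetical IDs/values, mirroring the notebook cells above.
    example = Example(input="input0", expected_output="expected_output0", id="example-id")
    output = ExampleOutput(run_id="run-id", example_id="example-id", output="output")
    lineage = RunLineage(example=example, output=output)

    # Outside IPython the hook can be called directly; in a notebook,
    # leaving `lineage` as the last expression of a cell triggers it.
    lineage._ipython_display_()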