From e774b294a8c05aefb7a9db71e9d85d06ae9ff602 Mon Sep 17 00:00:00 2001 From: Niklas Koehnecke Date: Thu, 28 Mar 2024 15:52:36 +0100 Subject: [PATCH] wip --- src/examples/test.ipynb | 299 ++++++++++++++++++ .../evaluation/dataset/domain.py | 7 + .../evaluation/evaluation/domain.py | 8 + .../infrastructure/repository_navigator.py | 18 ++ .../evaluation/run/domain.py | 8 + 5 files changed, 340 insertions(+) create mode 100644 src/examples/test.ipynb diff --git a/src/examples/test.ipynb b/src/examples/test.ipynb new file mode 100644 index 000000000..fd6c7d650 --- /dev/null +++ b/src/examples/test.ipynb @@ -0,0 +1,299 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from intelligence_layer.evaluation.dataset.domain import Example\n", + "from intelligence_layer.evaluation.run.domain import ExampleOutput\n", + "from intelligence_layer.evaluation.infrastructure.repository_navigator import RunLineage, EvaluationLineage, RepositoryNavigator" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'tuple' object has no attribute '_rich_render'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/intelligence-layer-d3iSWYpm-py3.10/lib/python3.10/site-packages/IPython/core/formatters.py:925\u001b[0m, in \u001b[0;36mIPythonDisplayFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 923\u001b[0m method \u001b[38;5;241m=\u001b[39m get_real_method(obj, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_method)\n\u001b[1;32m 924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m method \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 925\u001b[0m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 926\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/aleph/intelligence-layer/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py:43\u001b[0m, in \u001b[0;36mEvaluationLineage._ipython_display_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 41\u001b[0m output_tree \u001b[38;5;241m=\u001b[39m Tree(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOutputs:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m output \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutputs:\n\u001b[0;32m---> 43\u001b[0m output_tree\u001b[38;5;241m.\u001b[39madd(\u001b[43moutput\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_rich_render\u001b[49m(skip_example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n\u001b[1;32m 44\u001b[0m tree\u001b[38;5;241m.\u001b[39madd(output_tree)\n\u001b[1;32m 45\u001b[0m tree\u001b[38;5;241m.\u001b[39madd(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mevaluation\u001b[38;5;241m.\u001b[39m_rich_render(skip_example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m))\n", + "\u001b[0;31mAttributeError\u001b[0m: 'tuple' object has no attribute '_rich_render'" + ] + 
}, + { + "data": { + "text/plain": [ + "EvaluationLineage(example=Example ID = example-id\n", + "Input = input0\n", + "Expected output = \"expected_output0\"\n", + ", outputs=Example ID=example-id\n", + "Related Run ID=run-id\n", + "Output=\"output\"\n", + ", evaluation=Evaluation ID = eval-id\n", + "Example ID = example-id\n", + "Result = eval='Result'\n", + ")" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pydantic import BaseModel\n", + "from intelligence_layer.evaluation.evaluation.domain import ExampleEvaluation\n", + "\n", + "outputs = [ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\"), ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\")]\n", + "lineage = EvaluationLineage(example=Example(input=\"input0\", expected_output=\"expected_output0\", data=\"data0\", id=\"example-id\"), \n", + " outputs=ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\"),\n", + " evaluation=ExampleEvaluation(evaluation_id=\"eval-id\", example_id=\"example-id\", result=DummyEval(eval=\"Result\")))\n", + "lineage" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for RunLineage\noutput\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m lineage \u001b[38;5;241m=\u001b[39m \u001b[43mRunLineage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexample\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExample\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minput0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexpected_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpected_output0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mExampleOutput\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrun-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mexample_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moutput\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m lineage\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/intelligence-layer-d3iSWYpm-py3.10/lib/python3.10/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for RunLineage\noutput\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type" + ] + } + ], + "source": [ + "lineage = RunLineage(example=Example(input=\"input0\", expected_output=\"expected_output0\", data=\"data0\", id=\"example-id\"), \n", + " output=[ExampleOutput(run_id=\"run-id\", example_id=\"example-id\", output=\"output\")])\n", + "lineage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for EvaluationLineage\noutputs\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28meval\u001b[39m: \u001b[38;5;28mstr\u001b[39m\n\u001b[1;32m 6\u001b[0m outputs \u001b[38;5;241m=\u001b[39m [ExampleOutput(run_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexample-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, output\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m), ExampleOutput(run_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
example_id\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexample-id\u001b[39m\u001b[38;5;124m\"\u001b[39m, output\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m----> 7\u001b[0m lineage \u001b[38;5;241m=\u001b[39m \u001b[43mEvaluationLineage\u001b[49m\u001b[43m(\u001b[49m\u001b[43mexample\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExample\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43minput0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexpected_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexpected_output0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdata0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43moutputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mExampleOutput\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrun_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrun-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moutput\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mevaluation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mExampleEvaluation\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevaluation_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meval-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexample_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexample-id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mDummyEval\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43meval\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mResult\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m lineage\n", + "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/intelligence-layer-d3iSWYpm-py3.10/lib/python3.10/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from 
tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for EvaluationLineage\noutputs\n Input should be a valid dictionary or instance of ExampleOutput [type=model_type, input_value=[Example ID=example-id\nRe...run-id\nOutput=\"output\"\n], input_type=list]\n For further information visit https://errors.pydantic.dev/2.6/v/model_type" + ] + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exampleoutput
0Example ID = example-id\\nInput = input0\\nExpec...Example ID=example-id\\nRelated Run ID=run-id\\n...
\n", + "
" + ], + "text/plain": [ + " example output\n", + "0 Example ID = example-id\\nInput = input0\\nExpec... Example ID=example-id\\nRelated Run ID=run-id\\n..." + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame([lineage.__dict__])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Repository does not contain a run with id: non-existent", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mintelligence_layer\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mevaluation\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m InMemoryRunRepository\n\u001b[1;32m 2\u001b[0m repo \u001b[38;5;241m=\u001b[39m InMemoryRunRepository()\n\u001b[0;32m----> 3\u001b[0m \u001b[43mrepo\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexample_outputs\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnon-existent\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/aleph/intelligence-layer/src/intelligence_layer/evaluation/run/in_memory_run_repository.py:76\u001b[0m, in \u001b[0;36mInMemoryRunRepository.example_outputs\u001b[0;34m(self, run_id, output_type)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexample_outputs\u001b[39m(\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28mself\u001b[39m, run_id: \u001b[38;5;28mstr\u001b[39m, output_type: \u001b[38;5;28mtype\u001b[39m[Output]\n\u001b[1;32m 74\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterable[ExampleOutput[Output]]:\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m run_id \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_run_overviews\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m---> 76\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository does not contain a run with id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrun_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[1;32m 79\u001b[0m cast(ExampleOutput[Output], example_output)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m example_output \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 83\u001b[0m )\n\u001b[1;32m 84\u001b[0m )\n", + "\u001b[0;31mValueError\u001b[0m: Repository does not contain a run with id: non-existent" + ] + } + ], + "source": [ + "from intelligence_layer.evaluation import InMemoryRunRepository\n", + "repo = InMemoryRunRepository()\n", + "repo.example_outputs(\"non-existent\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# BODY_SIZE_LIMIT=\"Infinity\"" + ] + }, + { + "cell_type": "code", + 
"execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "from intelligence_layer.core.tracer.file_tracer import FileTracer\n", + "from intelligence_layer.core.tracer.in_memory_tracer import InMemoryTracer\n", + "tracer = FileTracer(Path(\"temp.jsonl\"))\n", + "#tracer = InMemoryTracer()\n", + "data = [\"this is my data\"* 10] * 10 \n", + "for i in range(100):\n", + " with tracer.span(\"test\") as t:\n", + " t.log(\"message\", data)\n", + "\n", + "tracer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "intelligence-layer-d3iSWYpm-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/intelligence_layer/evaluation/dataset/domain.py b/src/intelligence_layer/evaluation/dataset/domain.py index a61ff0ce0..e91fafd37 100644 --- a/src/intelligence_layer/evaluation/dataset/domain.py +++ b/src/intelligence_layer/evaluation/dataset/domain.py @@ -2,6 +2,7 @@ from uuid import uuid4 from pydantic import BaseModel, Field +from rich.tree import Tree from intelligence_layer.core.task import Input from intelligence_layer.core.tracer.tracer import PydanticSerializable @@ -40,6 +41,12 @@ def __str__(self) -> str: f'Expected output = "{self.expected_output}"\n' ) + def _rich_render(self): + example_tree = Tree(f"Example: {self.id}") + example_tree.add(str(self.input)) + example_tree.add(str(self.expected_output)) + return example_tree + class Dataset(BaseModel): """Represents a dataset linked to multiple examples diff --git a/src/intelligence_layer/evaluation/evaluation/domain.py b/src/intelligence_layer/evaluation/evaluation/domain.py index 6883be626..da0700fb4 100644 --- a/src/intelligence_layer/evaluation/evaluation/domain.py +++ b/src/intelligence_layer/evaluation/evaluation/domain.py @@ -2,6 +2,7 @@ from typing import Generic, Optional, TypeVar from pydantic import BaseModel, SerializeAsAny +from rich.tree import Tree from intelligence_layer.evaluation.run.domain import RunOverview @@ -55,6 +56,13 @@ def __str__(self) -> str: f"Result = {self.result}\n" ) + def _rich_render(self, skip_example_id: bool = False): + tree = Tree(f"Evaluation: {self.evaluation_id}") + if not skip_example_id: + tree.add(str(self.example_id)) + tree.add(str(self.result)) + return tree + class EvaluationOverview(BaseModel, frozen=True): """Overview of the un-aggregated results of evaluating a :class:`Task` on a dataset. 
diff --git a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
index ca3cf7ba5..944d660ea 100644
--- a/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
+++ b/src/intelligence_layer/evaluation/infrastructure/repository_navigator.py
@@ -1,7 +1,9 @@
 import itertools
 from typing import Generic, Iterable, Sequence
 
+import rich
 from pydantic import BaseModel
+from rich.tree import Tree
 
 from intelligence_layer.core.task import Input, Output
 from intelligence_layer.evaluation.dataset.dataset_repository import DatasetRepository
@@ -21,12 +23,28 @@ class RunLineage(BaseModel, Generic[Input, ExpectedOutput, Output]):
     example: Example[Input, ExpectedOutput]
     output: ExampleOutput[Output]
 
+    def _ipython_display_(self):
+        tree = Tree("Run Lineage")
+        tree.add(self.example._rich_render())
+        tree.add(self.output._rich_render(skip_example_id=True))
+        rich.print(tree)
+
 
 class EvaluationLineage(BaseModel, Generic[Input, ExpectedOutput, Output, Evaluation]):
     example: Example[Input, ExpectedOutput]
     outputs: Sequence[ExampleOutput[Output]]
     evaluation: ExampleEvaluation[Evaluation]
 
+    def _ipython_display_(self):
+        tree = Tree("Evaluation Lineage")
+        tree.add(self.example._rich_render())
+        output_tree = Tree("Outputs:")
+        for output in self.outputs:
+            output_tree.add(output._rich_render(skip_example_id=True))
+        tree.add(output_tree)
+        tree.add(self.evaluation._rich_render(skip_example_id=True))
+        rich.print(tree)
+
 
 class RepositoryNavigator:
     """The `RepositoryNavigator` is used to retrieve coupled data from multiple repositories."""
diff --git a/src/intelligence_layer/evaluation/run/domain.py b/src/intelligence_layer/evaluation/run/domain.py
index 20a6ef296..cbd1e2dce 100644
--- a/src/intelligence_layer/evaluation/run/domain.py
+++ b/src/intelligence_layer/evaluation/run/domain.py
@@ -2,6 +2,7 @@
 from typing import Generic
 
 from pydantic import BaseModel
+from rich.tree import Tree
 
 from intelligence_layer.core.task import Output
 
@@ -49,6 +50,13 @@ def __str__(self) -> str:
             f'Output="{self.output}"\n'
         )
 
+    def _rich_render(self, skip_example_id: bool = False):
+        tree = Tree(f"Output: {self.run_id}")
+        if not skip_example_id:
+            tree.add(str(self.example_id))
+        tree.add(str(self.output))
+        return tree
+
 
 class SuccessfulExampleOutput(BaseModel, Generic[Output]):
     """Successful output of a single evaluated :class:`Example`
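
Usage sketch (illustrative, not part of the diff above): one way the new `_ipython_display_` hooks could be exercised, reusing the constructors that already appear in the notebook cells; the IDs and payloads are made-up placeholders.

    from intelligence_layer.evaluation.dataset.domain import Example
    from intelligence_layer.evaluation.infrastructure.repository_navigator import RunLineage
    from intelligence_layer.evaluation.run.domain import ExampleOutput

    # Hypothetical IDs/values, mirroring the notebook cells above.
    example = Example(input="input0", expected_output="expected_output0", id="example-id")
    output = ExampleOutput(run_id="run-id", example_id="example-id", output="output")
    lineage = RunLineage(example=example, output=output)

    # Outside IPython the hook can be called directly; in a notebook,
    # leaving `lineage` as the last expression of a cell triggers it.
    lineage._ipython_display_()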