From b05d7a808707d469311e1d1cfa194f368fe6ea3d Mon Sep 17 00:00:00 2001
From: steven krawczyk <stevent.krawczyk@gmail.com>
Date: Mon, 11 Sep 2023 18:30:02 -0700
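Subject: [PATCH] Fix notebook: remove leftover merge conflict markers

Resolve the conflict in FineTuningExperiment.ipynb in favor of HEAD (the
wikisql text-to-SQL walkthrough) and drop the competition_math cells that
came from 4be34f31. The dropped side hardcoded an OpenAI API key in a cell
source; treat that key as compromised and revoke it.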

---
 examples/notebooks/FineTuningExperiment.ipynb | 267 ------------------
 1 file changed, 267 deletions(-)

diff --git a/examples/notebooks/FineTuningExperiment.ipynb b/examples/notebooks/FineTuningExperiment.ipynb
index 74549e9e..181de5ff 100644
--- a/examples/notebooks/FineTuningExperiment.ipynb
+++ b/examples/notebooks/FineTuningExperiment.ipynb
@@ -2,7 +2,6 @@
  "cells": [
   {
    "cell_type": "markdown",
-<<<<<<< HEAD
    "id": "cf18498b",
    "metadata": {},
    "source": [
@@ -21,8 +20,6 @@
   },
   {
    "cell_type": "markdown",
-=======
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "id": "623f0cfe",
    "metadata": {},
    "source": [
@@ -31,11 +28,7 @@
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": null,
-=======
-   "execution_count": 1,
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "id": "885dabeb",
    "metadata": {},
    "outputs": [],
@@ -61,11 +54,7 @@
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": 1,
-=======
-   "execution_count": 13,
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "id": "ed4e635e",
    "metadata": {},
    "outputs": [],
@@ -73,11 +62,7 @@
     "import os\n",
     "\n",
     "os.environ[\"DEBUG\"] = \"\"  # Set this to \"\" to call OpenAI's API\n",
-<<<<<<< HEAD
     "os.environ[\"OPENAI_API_KEY\"] = \"\"  # Insert your key here"
-=======
-    "os.environ[\"OPENAI_API_KEY\"] = \"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\"  # Insert your key here"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
@@ -90,25 +75,17 @@
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": 2,
-=======
-   "execution_count": 14,
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "id": "beaa70a1",
    "metadata": {},
    "outputs": [],
    "source": [
     "from typing import Dict, List\n",
-<<<<<<< HEAD
     "from datasets import load_dataset\n",
-=======
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
     "from prompttools.experiment import OpenAIChatExperiment"
    ]
   },
   {
-<<<<<<< HEAD
    "cell_type": "markdown",
    "id": "62413948",
    "metadata": {},
@@ -121,121 +98,25 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-=======
-   "cell_type": "code",
-   "execution_count": 1,
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "id": "9e94cbd6",
    "metadata": {},
    "outputs": [
     {
-<<<<<<< HEAD
      "name": "stderr",
      "output_type": "stream",
      "text": [
       "Found cached dataset wikisql (/Users/stevenkrawczyk/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n"
-=======
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b06d6040e5ad4ecda180cdc7844049c7",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading builder script:   0%|          | 0.00/2.57k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "52a5ad22687844e2b575cc8ebc3f6187",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/5.32k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset competition_math/default to /Users/stevenkrawczyk/.cache/huggingface/datasets/competition_math/default/1.0.0/52c6a268ae72ef772498d27551a3f682dac50cd8befddd0326d758cb6908b5f0...\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8c25859e6f5a4ed8aa5a5b5569b6eb10",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0.00/7.91M [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split:   0%|          | 0/7500 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset competition_math downloaded and prepared to /Users/stevenkrawczyk/.cache/huggingface/datasets/competition_math/default/1.0.0/52c6a268ae72ef772498d27551a3f682dac50cd8befddd0326d758cb6908b5f0. Subsequent calls will reuse this data.\n"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-<<<<<<< HEAD
        "model_id": "c5ff2f14b48b44848f2fa1ae4ab3690c",
-=======
-       "model_id": "7226ff6ac9a6408f90d840172214217a",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
        "version_major": 2,
        "version_minor": 0
       },
       "text/plain": [
-<<<<<<< HEAD
        "  0%|          | 0/3 [00:00<?, ?it/s]"
-=======
-       "  0%|          | 0/2 [00:00<?, ?it/s]"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
       ]
      },
      "metadata": {},
@@ -243,7 +124,6 @@
     }
    ],
    "source": [
-<<<<<<< HEAD
     "dataset = load_dataset(\"wikisql\")"
    ]
   },
@@ -382,73 +262,34 @@
     "## Compare fine tuned model to base model\n",
     "\n",
     "Once the fine tuning job finishes, we can compare the fine tuned model to the base model in the same experiment, as seen below."
-=======
-    "from datasets import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"competition_math\")"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": 4,
    "id": "9685fc1c",
-=======
-   "execution_count": 12,
-   "id": "9a9d436f",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-<<<<<<< HEAD
       "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15878/15878 [00:03<00:00, 4170.29it/s]\n"
      ]
-=======
-      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7500/7500 [00:00<00:00, 28039.39it/s]\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "{'problem': 'A very large number $x$ is equal to $2^23^34^45^56^67^78^89^9$. What is the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square?',\n",
-       " 'level': 'Level 5',\n",
-       " 'type': 'Number Theory',\n",
-       " 'solution': \"For the product to be a perfect square, all the exponents need to be even. So we don't need to worry about factors that already have even exponents. We also don't need to worry about $9^9$ because $9$ is already a perfect square. The remaining factors are $3^35^57^7$.\\n\\nTo get even exponents in the product, we need at least one more $3$, at least one more $5$, and at least one more $7$. That would bring us up to $3^45^67^8$, and everything would be good. And indeed, $3\\\\cdot5\\\\cdot7=\\\\boxed{105}$.\"}"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
     }
    ],
    "source": [
     "from tqdm import tqdm\n",
     "\n",
-<<<<<<< HEAD
     "filtered_test_set = []\n",
     "\n",
     "for entry in tqdm(dataset['test']):\n",
     "    if entry['phase'] == 1:\n",
     "        filtered_test_set.append(entry)"
-=======
-    "filtered_set = []\n",
-    "\n",
-    "for entry in tqdm(dataset['train']):\n",
-    "    if entry['type'] == 'Number Theory' and entry['level'] == 'Level 5':\n",
-    "        filtered_set.append(entry)\n",
-    "\n",
-    "filtered_set[0]"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": 5,
    "id": "ea08d3e3",
    "metadata": {},
@@ -462,18 +303,6 @@
     "                                      \"valid SQL to answer the user question:\" + \\\n",
     "                                      str(filtered_test_set[0]['table'])},\n",
     "        {\"role\": \"user\", \"content\": filtered_test_set[0]['question']},\n",
-=======
-   "execution_count": 15,
-   "id": "a19d8a9f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "models = [\"gpt-4\"]  # You can also use a fine-tuned model here, e.g. [\"ft:gpt-3.5-turbo:org_id\"]\n",
-    "messages = [\n",
-    "    [\n",
-    "        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-    "        {\"role\": \"user\", \"content\": filtered_set[0]['problem']},\n",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
     "    ]\n",
     "]\n",
     "temperatures = [0.0]\n",
@@ -484,13 +313,8 @@
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": 6,
    "id": "df194340",
-=======
-   "execution_count": 16,
-   "id": "f86acba0",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "metadata": {},
    "outputs": [],
    "source": [
@@ -499,13 +323,8 @@
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": 7,
    "id": "c6834b15",
-=======
-   "execution_count": 18,
-   "id": "06ee8ca1",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "metadata": {},
    "outputs": [
     {
@@ -529,11 +348,8 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-<<<<<<< HEAD
        "      <th>model</th>\n",
        "      <th>messages</th>\n",
-=======
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
        "      <th>response</th>\n",
        "      <th>latency</th>\n",
        "    </tr>\n",
@@ -541,7 +357,6 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-<<<<<<< HEAD
        "      <td>gpt-3.5-turbo</td>\n",
        "      <td>[{'role': 'system', 'content': 'You are a text-to-SQL helper. Given the following table, produce valid SQL to answer the user question:{'header': ['Player', 'No.', 'Nationality', 'Position', 'Years in Toronto', 'School/Club Team'], 'page_title': 'Toronto Raptors all-time roster', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-10015132-16', 'section_title': 'R', 'caption': 'R', 'rows': [['Aleksandar Radojević', '25', 'Serbia', 'Center', '1999-2000', 'Barton CC (KS)'], ['Shawn Respert', '31', 'United States', 'Guard', '1997-98', 'Michigan State'], ['Quentin Richardson', 'N/A', 'United States', 'Forward', '2013-present', 'DePaul'], ['Alvin Robertson', '7, 21', 'United States', 'Guard', '1995-96', 'Arkansas'], ['Carlos Rogers', '33, 34', 'United States', 'Forward-Center', '1995-98', 'Tennessee State'], ['Roy Rogers', '9', 'United States', 'Forward', '1998', 'Alabama'], ['Jalen Rose', '5', 'United States', 'Guard-Forward', '2003-06', 'Michigan'], ['Terrence Ross', '31', 'United States', 'Guard', '2012-present', 'Washington']], 'name': 'table_10015132_16'}'}, {'role': 'user', 'content': 'What is terrence ross' nationality'}]</td>\n",
        "      <td>SELECT Nationality\\nFROM table_10015132_16\\nWHERE Player = 'Terrence Ross'</td>\n",
@@ -553,17 +368,12 @@
        "      <td>[{'role': 'system', 'content': 'You are a text-to-SQL helper. Given the following table, produce valid SQL to answer the user question:{'header': ['Player', 'No.', 'Nationality', 'Position', 'Years in Toronto', 'School/Club Team'], 'page_title': 'Toronto Raptors all-time roster', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-10015132-16', 'section_title': 'R', 'caption': 'R', 'rows': [['Aleksandar Radojević', '25', 'Serbia', 'Center', '1999-2000', 'Barton CC (KS)'], ['Shawn Respert', '31', 'United States', 'Guard', '1997-98', 'Michigan State'], ['Quentin Richardson', 'N/A', 'United States', 'Forward', '2013-present', 'DePaul'], ['Alvin Robertson', '7, 21', 'United States', 'Guard', '1995-96', 'Arkansas'], ['Carlos Rogers', '33, 34', 'United States', 'Forward-Center', '1995-98', 'Tennessee State'], ['Roy Rogers', '9', 'United States', 'Forward', '1998', 'Alabama'], ['Jalen Rose', '5', 'United States', 'Guard-Forward', '2003-06', 'Michigan'], ['Terrence Ross', '31', 'United States', 'Guard', '2012-present', 'Washington']], 'name': 'table_10015132_16'}'}, {'role': 'user', 'content': 'What is terrence ross' nationality'}]</td>\n",
        "      <td>SELECT Nationality FROM table_10015132_16 WHERE Player = Terrence Ross</td>\n",
        "      <td>1.149571</td>\n",
-=======
-       "      <td>The prime factorization of $x$ is $2^{23+2*4+3*6+4*8} * 3^{3+2*5+3*7} * 5^{5+2*6} * 7^{7+2*8} * 11^{9}$.\\n\\nThis simplifies to $2^{60} * 3^{31} * 5^{17} * 7^{23} * 11^{9}$.\\n\\nIn order for a number to be a perfect square, all of its prime factors must be raised to even powers. Therefore, we need to multiply $x$ by a number that will make all of the exponents in its prime factorization even.\\n\\nThe smallest such number is $2 * 3 * 5 * 7 * 11^1 = 2 * 3 * 5 * 7 * 11 = 2310$.\\n\\nTherefore, the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square is $\\boxed{2310}$.</td>\n",
-       "      <td>18.24321</td>\n",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-<<<<<<< HEAD
        "                                      model  \\\n",
        "0  gpt-3.5-turbo                              \n",
        "1  ft:gpt-3.5-turbo-0613:hegel-ai::7ximyJEn   \n",
@@ -579,13 +389,6 @@
        "    latency  \n",
        "0  1.120881  \n",
        "1  1.149571  "
-=======
-       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  response  \\\n",
-       "0  The prime factorization of $x$ is $2^{23+2*4+3*6+4*8} * 3^{3+2*5+3*7} * 5^{5+2*6} * 7^{7+2*8} * 11^{9}$.\\n\\nThis simplifies to $2^{60} * 3^{31} * 5^{17} * 7^{23} * 11^{9}$.\\n\\nIn order for a number to be a perfect square, all of its prime factors must be raised to even powers. Therefore, we need to multiply $x$ by a number that will make all of the exponents in its prime factorization even.\\n\\nThe smallest such number is $2 * 3 * 5 * 7 * 11^1 = 2 * 3 * 5 * 7 * 11 = 2310$.\\n\\nTherefore, the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square is $\\boxed{2310}$.   \n",
-       "\n",
-       "    latency  \n",
-       "0  18.24321  "
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
       ]
      },
      "metadata": {},
@@ -597,7 +400,6 @@
    ]
   },
   {
-<<<<<<< HEAD
    "cell_type": "code",
    "execution_count": 8,
    "id": "f8f9d237",
@@ -686,18 +488,10 @@
    "metadata": {},
    "source": [
     "For our example, both the base model and fine tuned model do a good job at returning only SQL, but the tuned model is marginally closer to the correct answer and has better syntax."
-=======
-   "cell_type": "markdown",
-   "id": "89cd320e",
-   "metadata": {},
-   "source": [
-    "## Auto-Evaluate the model response"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "markdown",
-<<<<<<< HEAD
    "id": "50edd181",
    "metadata": {},
    "source": [
@@ -714,17 +508,10 @@
    "outputs": [],
    "source": [
     "filtered_tuning_set = [entry for entry in tuning_set if len(str(entry['table'])) < 2000]"
-=======
-   "id": "c36767bd",
-   "metadata": {},
-   "source": [
-    "To evaluate the model response, we can define an eval method that passes the input and response into another LLM to get feedback."
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "code",
-<<<<<<< HEAD
    "execution_count": null,
    "id": "4eadc98f",
    "metadata": {},
@@ -753,37 +540,21 @@
     "for row in replicate_fine_tuning_rows:\n",
     "    with open('replicate_fine_tuning_rows.jsonl', 'a') as f:\n",
     "        f.write(json.dumps(row) + '\\n')"
-=======
-   "execution_count": 19,
-   "id": "8fe03c31",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from prompttools.utils import autoeval_from_expected"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "markdown",
-<<<<<<< HEAD
    "id": "7425c7fd",
    "metadata": {},
    "source": [
     "## Run fine tuning\n",
     "\n",
     "For replicate, you will need to upload your fine-tuning rows to an accessible URL. For our example, we uploaded the file to an S3 bucket and used a signed URL to provide access."
-=======
-   "id": "3008b09e",
-   "metadata": {},
-   "source": [
-    "Finally, we can evaluate and visualize the results."
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 23,
-<<<<<<< HEAD
    "id": "89554c1e",
    "metadata": {},
    "outputs": [],
@@ -1057,50 +828,12 @@
     "# Conclusion\n",
     "\n",
     "GPT-3.5 does better at producing SQL, especially when fine tuned. One advantage is the larger context window, which allows us to tune on larger SQL tables and provide more context at query time. Another advantage is the power of the underlying model, which is already quite good at the task."
-=======
-   "id": "4633f802",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "'prompt'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3653\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3652\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3653\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3654\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:147\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:171\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:214\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine._get_loc_duplicates\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:222\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine._maybe_get_bool_indexer\u001b[0;34m()\u001b[0m\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:114\u001b[0m, in \u001b[0;36mpandas._libs.index._unpack_bool_indexer\u001b[0;34m()\u001b[0m\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'prompt'",
-      "\nThe above exception was the direct cause of the following exception:\n",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[23], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mexperiment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcorrectness\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mautoeval_from_expected\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautoeval_from_expected_response\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt_column_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexpected\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfiltered_set\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msolution\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      2\u001b[0m experiment\u001b[38;5;241m.\u001b[39mvisualize()\n",
-      "File \u001b[0;32m~/Development/prompttools/prompttools/experiment/experiments/experiment.py:344\u001b[0m, in \u001b[0;36mExperiment.evaluate\u001b[0;34m(self, metric_name, eval_fn, static_eval_fn_kwargs, image_experiment, **eval_fn_kwargs)\u001b[0m\n\u001b[1;32m    342\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m eval_fn_kwargs\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m    343\u001b[0m         curr_kwargs[k] \u001b[38;5;241m=\u001b[39m v[i]\n\u001b[0;32m--> 344\u001b[0m     res\u001b[38;5;241m.\u001b[39mappend(\u001b[43meval_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurr_kwargs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m    345\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_score(metric_name, res)\n",
-      "File \u001b[0;32m~/Development/prompttools/prompttools/utils/autoeval_from_expected.py:66\u001b[0m, in \u001b[0;36mautoeval_from_expected_response\u001b[0;34m(row, expected, prompt_column_name, response_column_name)\u001b[0m\n\u001b[1;32m     63\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mautoeval_from_expected_response\u001b[39m(\n\u001b[1;32m     64\u001b[0m     row: pandas\u001b[38;5;241m.\u001b[39mcore\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries, expected: \u001b[38;5;28mstr\u001b[39m, prompt_column_name: \u001b[38;5;28mstr\u001b[39m, response_column_name: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresponse\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     65\u001b[0m ):\n\u001b[0;32m---> 66\u001b[0m     prompt \u001b[38;5;241m=\u001b[39m \u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[43mprompt_column_name\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m     67\u001b[0m     response \u001b[38;5;241m=\u001b[39m row[response_column_name]\n\u001b[1;32m     68\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m compute(prompt, expected, response)\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/series.py:1007\u001b[0m, in \u001b[0;36mSeries.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   1004\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[key]\n\u001b[1;32m   1006\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m key_is_scalar:\n\u001b[0;32m-> 1007\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1009\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_hashable(key):\n\u001b[1;32m   1010\u001b[0m     \u001b[38;5;66;03m# Otherwise index.get_value will raise InvalidIndexError\u001b[39;00m\n\u001b[1;32m   1011\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1012\u001b[0m         \u001b[38;5;66;03m# For labels that don't resolve as scalars like tuples and frozensets\u001b[39;00m\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/series.py:1116\u001b[0m, in \u001b[0;36mSeries._get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m   1113\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[label]\n\u001b[1;32m   1115\u001b[0m \u001b[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001b[39;00m\n\u001b[0;32m-> 1116\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(loc):\n\u001b[1;32m   1119\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[loc]\n",
-      "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3655\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   3653\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m   3654\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m-> 3655\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m   3656\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m   3657\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m   3658\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m   3659\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[1;32m   3660\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
-      "\u001b[0;31mKeyError\u001b[0m: 'prompt'"
-     ]
-    }
-   ],
-   "source": [
-    "experiment.evaluate(\"correctness\", autoeval_from_expected.autoeval_from_expected_response, {\"prompt_column_name\": \"prompt\"}, expected=filtered_set[0]['solution'])\n",
-    "experiment.visualize()"
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-<<<<<<< HEAD
    "id": "587efa63",
-=======
-   "id": "c7aaeff9",
->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1
    "metadata": {},
    "outputs": [],
    "source": []