From b05d7a808707d469311e1d1cfa194f368fe6ea3d Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Mon, 11 Sep 2023 18:30:02 -0700 Subject: [PATCH] Fix notebook --- examples/notebooks/FineTuningExperiment.ipynb | 267 ------------------ 1 file changed, 267 deletions(-) diff --git a/examples/notebooks/FineTuningExperiment.ipynb b/examples/notebooks/FineTuningExperiment.ipynb index 74549e9e..181de5ff 100644 --- a/examples/notebooks/FineTuningExperiment.ipynb +++ b/examples/notebooks/FineTuningExperiment.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", -<<<<<<< HEAD "id": "cf18498b", "metadata": {}, "source": [ @@ -21,8 +20,6 @@ }, { "cell_type": "markdown", -======= ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "id": "623f0cfe", "metadata": {}, "source": [ @@ -31,11 +28,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": null, -======= - "execution_count": 1, ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "id": "885dabeb", "metadata": {}, "outputs": [], @@ -61,11 +54,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": 1, -======= - "execution_count": 13, ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "id": "ed4e635e", "metadata": {}, "outputs": [], @@ -73,11 +62,7 @@ "import os\n", "\n", "os.environ[\"DEBUG\"] = \"\" # Set this to \"\" to call OpenAI's API\n", -<<<<<<< HEAD "os.environ[\"OPENAI_API_KEY\"] = \"\" # Insert your key here" -======= - "os.environ[\"OPENAI_API_KEY\"] = \"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\" # Insert your key here" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { @@ -90,25 +75,17 @@ }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": 2, -======= - "execution_count": 14, ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "id": "beaa70a1", "metadata": {}, "outputs": [], "source": [ "from typing import Dict, List\n", -<<<<<<< HEAD "from datasets import load_dataset\n", -======= ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "from prompttools.experiment import OpenAIChatExperiment" ] }, { -<<<<<<< HEAD "cell_type": "markdown", "id": "62413948", "metadata": {}, @@ -121,121 +98,25 @@ { "cell_type": "code", "execution_count": 3, -======= - "cell_type": "code", - "execution_count": 1, ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "id": "9e94cbd6", "metadata": {}, "outputs": [ { -<<<<<<< HEAD "name": "stderr", "output_type": "stream", "text": [ "Found cached dataset wikisql (/Users/stevenkrawczyk/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d)\n" -======= - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b06d6040e5ad4ecda180cdc7844049c7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading builder script: 0%| | 0.00/2.57k [00:00>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { -<<<<<<< HEAD "model_id": "c5ff2f14b48b44848f2fa1ae4ab3690c", -======= - "model_id": "7226ff6ac9a6408f90d840172214217a", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "version_major": 2, "version_minor": 0 }, "text/plain": [ -<<<<<<< HEAD " 0%| | 0/3 [00:00>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, "metadata": {}, @@ -243,7 +124,6 @@ } ], "source": [ -<<<<<<< HEAD "dataset = load_dataset(\"wikisql\")" ] }, @@ -382,73 +262,34 @@ "## Compare fine tuned model to base model\n", "\n", "Once the fine tuning job finishes, we can compare the fine tuned model to the base model in the same experiment, as seen below." -======= - "from datasets import load_dataset\n", - "\n", - "dataset = load_dataset(\"competition_math\")" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": 4, "id": "9685fc1c", -======= - "execution_count": 12, - "id": "9a9d436f", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ -<<<<<<< HEAD "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15878/15878 [00:03<00:00, 4170.29it/s]\n" ] -======= - "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7500/7500 [00:00<00:00, 28039.39it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "{'problem': 'A very large number $x$ is equal to $2^23^34^45^56^67^78^89^9$. What is the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square?',\n", - " 'level': 'Level 5',\n", - " 'type': 'Number Theory',\n", - " 'solution': \"For the product to be a perfect square, all the exponents need to be even. So we don't need to worry about factors that already have even exponents. We also don't need to worry about $9^9$ because $9$ is already a perfect square. The remaining factors are $3^35^57^7$.\\n\\nTo get even exponents in the product, we need at least one more $3$, at least one more $5$, and at least one more $7$. That would bring us up to $3^45^67^8$, and everything would be good. And indeed, $3\\\\cdot5\\\\cdot7=\\\\boxed{105}$.\"}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 } ], "source": [ "from tqdm import tqdm\n", "\n", -<<<<<<< HEAD "filtered_test_set = []\n", "\n", "for entry in tqdm(dataset['test']):\n", " if entry['phase'] == 1:\n", " filtered_test_set.append(entry)" -======= - "filtered_set = []\n", - "\n", - "for entry in tqdm(dataset['train']):\n", - " if entry['type'] == 'Number Theory' and entry['level'] == 'Level 5':\n", - " filtered_set.append(entry)\n", - "\n", - "filtered_set[0]" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": 5, "id": "ea08d3e3", "metadata": {}, @@ -462,18 +303,6 @@ " \"valid SQL to answer the user question:\" + \\\n", " str(filtered_test_set[0]['table'])},\n", " {\"role\": \"user\", \"content\": filtered_test_set[0]['question']},\n", -======= - "execution_count": 15, - "id": "a19d8a9f", - "metadata": {}, - "outputs": [], - "source": [ - "models = [\"gpt-4\"] # You can also use a fine-tuned model here, e.g. [\"ft:gpt-3.5-turbo:org_id\"]\n", - "messages = [\n", - " [\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": filtered_set[0]['problem']},\n", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 " ]\n", "]\n", "temperatures = [0.0]\n", @@ -484,13 +313,8 @@ }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": 6, "id": "df194340", -======= - "execution_count": 16, - "id": "f86acba0", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "metadata": {}, "outputs": [], "source": [ @@ -499,13 +323,8 @@ }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": 7, "id": "c6834b15", -======= - "execution_count": 18, - "id": "06ee8ca1", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "metadata": {}, "outputs": [ { @@ -529,11 +348,8 @@ " \n", " \n", " \n", -<<<<<<< HEAD " model\n", " messages\n", -======= ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 " response\n", " latency\n", " \n", @@ -541,7 +357,6 @@ " \n", " \n", " 0\n", -<<<<<<< HEAD " gpt-3.5-turbo\n", " [{'role': 'system', 'content': 'You are a text-to-SQL helper. Given the following table, produce valid SQL to answer the user question:{'header': ['Player', 'No.', 'Nationality', 'Position', 'Years in Toronto', 'School/Club Team'], 'page_title': 'Toronto Raptors all-time roster', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-10015132-16', 'section_title': 'R', 'caption': 'R', 'rows': [['Aleksandar Radojević', '25', 'Serbia', 'Center', '1999-2000', 'Barton CC (KS)'], ['Shawn Respert', '31', 'United States', 'Guard', '1997-98', 'Michigan State'], ['Quentin Richardson', 'N/A', 'United States', 'Forward', '2013-present', 'DePaul'], ['Alvin Robertson', '7, 21', 'United States', 'Guard', '1995-96', 'Arkansas'], ['Carlos Rogers', '33, 34', 'United States', 'Forward-Center', '1995-98', 'Tennessee State'], ['Roy Rogers', '9', 'United States', 'Forward', '1998', 'Alabama'], ['Jalen Rose', '5', 'United States', 'Guard-Forward', '2003-06', 'Michigan'], ['Terrence Ross', '31', 'United States', 'Guard', '2012-present', 'Washington']], 'name': 'table_10015132_16'}'}, {'role': 'user', 'content': 'What is terrence ross' nationality'}]\n", " SELECT Nationality\\nFROM table_10015132_16\\nWHERE Player = 'Terrence Ross'\n", @@ -553,17 +368,12 @@ " [{'role': 'system', 'content': 'You are a text-to-SQL helper. Given the following table, produce valid SQL to answer the user question:{'header': ['Player', 'No.', 'Nationality', 'Position', 'Years in Toronto', 'School/Club Team'], 'page_title': 'Toronto Raptors all-time roster', 'page_id': '', 'types': ['text', 'text', 'text', 'text', 'text', 'text'], 'id': '1-10015132-16', 'section_title': 'R', 'caption': 'R', 'rows': [['Aleksandar Radojević', '25', 'Serbia', 'Center', '1999-2000', 'Barton CC (KS)'], ['Shawn Respert', '31', 'United States', 'Guard', '1997-98', 'Michigan State'], ['Quentin Richardson', 'N/A', 'United States', 'Forward', '2013-present', 'DePaul'], ['Alvin Robertson', '7, 21', 'United States', 'Guard', '1995-96', 'Arkansas'], ['Carlos Rogers', '33, 34', 'United States', 'Forward-Center', '1995-98', 'Tennessee State'], ['Roy Rogers', '9', 'United States', 'Forward', '1998', 'Alabama'], ['Jalen Rose', '5', 'United States', 'Guard-Forward', '2003-06', 'Michigan'], ['Terrence Ross', '31', 'United States', 'Guard', '2012-present', 'Washington']], 'name': 'table_10015132_16'}'}, {'role': 'user', 'content': 'What is terrence ross' nationality'}]\n", " SELECT Nationality FROM table_10015132_16 WHERE Player = Terrence Ross\n", " 1.149571\n", -======= - " The prime factorization of $x$ is $2^{23+2*4+3*6+4*8} * 3^{3+2*5+3*7} * 5^{5+2*6} * 7^{7+2*8} * 11^{9}$.\\n\\nThis simplifies to $2^{60} * 3^{31} * 5^{17} * 7^{23} * 11^{9}$.\\n\\nIn order for a number to be a perfect square, all of its prime factors must be raised to even powers. Therefore, we need to multiply $x$ by a number that will make all of the exponents in its prime factorization even.\\n\\nThe smallest such number is $2 * 3 * 5 * 7 * 11^1 = 2 * 3 * 5 * 7 * 11 = 2310$.\\n\\nTherefore, the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square is $\\boxed{2310}$.\n", - " 18.24321\n", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 " \n", " \n", "\n", "" ], "text/plain": [ -<<<<<<< HEAD " model \\\n", "0 gpt-3.5-turbo \n", "1 ft:gpt-3.5-turbo-0613:hegel-ai::7ximyJEn \n", @@ -579,13 +389,6 @@ " latency \n", "0 1.120881 \n", "1 1.149571 " -======= - " response \\\n", - "0 The prime factorization of $x$ is $2^{23+2*4+3*6+4*8} * 3^{3+2*5+3*7} * 5^{5+2*6} * 7^{7+2*8} * 11^{9}$.\\n\\nThis simplifies to $2^{60} * 3^{31} * 5^{17} * 7^{23} * 11^{9}$.\\n\\nIn order for a number to be a perfect square, all of its prime factors must be raised to even powers. Therefore, we need to multiply $x$ by a number that will make all of the exponents in its prime factorization even.\\n\\nThe smallest such number is $2 * 3 * 5 * 7 * 11^1 = 2 * 3 * 5 * 7 * 11 = 2310$.\\n\\nTherefore, the smallest positive integer that, when multiplied with $x$, produces a product that is a perfect square is $\\boxed{2310}$. \n", - "\n", - " latency \n", - "0 18.24321 " ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, "metadata": {}, @@ -597,7 +400,6 @@ ] }, { -<<<<<<< HEAD "cell_type": "code", "execution_count": 8, "id": "f8f9d237", @@ -686,18 +488,10 @@ "metadata": {}, "source": [ "For our example, both the base model and fine tuned model do a good job at returning only SQL, but the tuned model is marginally closer to the correct answer and has better syntax." -======= - "cell_type": "markdown", - "id": "89cd320e", - "metadata": {}, - "source": [ - "## Auto-Evaluate the model response" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "markdown", -<<<<<<< HEAD "id": "50edd181", "metadata": {}, "source": [ @@ -714,17 +508,10 @@ "outputs": [], "source": [ "filtered_tuning_set = [entry for entry in tuning_set if len(str(entry['table'])) < 2000]" -======= - "id": "c36767bd", - "metadata": {}, - "source": [ - "To evaluate the model response, we can define an eval method that passes the input and response into another LLM to get feedback." ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "code", -<<<<<<< HEAD "execution_count": null, "id": "4eadc98f", "metadata": {}, @@ -753,37 +540,21 @@ "for row in replicate_fine_tuning_rows:\n", " with open('replicate_fine_tuning_rows.jsonl', 'a') as f:\n", " f.write(json.dumps(row) + '\\n')" -======= - "execution_count": 19, - "id": "8fe03c31", - "metadata": {}, - "outputs": [], - "source": [ - "from prompttools.utils import autoeval_from_expected" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "markdown", -<<<<<<< HEAD "id": "7425c7fd", "metadata": {}, "source": [ "## Run fine tuning\n", "\n", "For replicate, you will need to upload your fine-tuning rows to an accessible URL. For our example, we uploaded the file to an S3 bucket and used a signed URL to provide access." -======= - "id": "3008b09e", - "metadata": {}, - "source": [ - "Finally, we can evaluate and visualize the results." ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "code", "execution_count": 23, -<<<<<<< HEAD "id": "89554c1e", "metadata": {}, "outputs": [], @@ -1057,50 +828,12 @@ "# Conclusion\n", "\n", "GPT-3.5 does better at producing SQL, especially when fine tuned. One advantage is the larger context window, which allows us to tune on larger SQL tables and provide more context at query time. Another advantage is the power of the underlying model, which is already quite good at the task." -======= - "id": "4633f802", - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'prompt'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3653\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3652\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3653\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3654\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:147\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:171\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:214\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine._get_loc_duplicates\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:222\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine._maybe_get_bool_indexer\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/_libs/index.pyx:114\u001b[0m, in \u001b[0;36mpandas._libs.index._unpack_bool_indexer\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'prompt'", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[23], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mexperiment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcorrectness\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mautoeval_from_expected\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautoeval_from_expected_response\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt_column_name\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexpected\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfiltered_set\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msolution\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m experiment\u001b[38;5;241m.\u001b[39mvisualize()\n", - "File \u001b[0;32m~/Development/prompttools/prompttools/experiment/experiments/experiment.py:344\u001b[0m, in \u001b[0;36mExperiment.evaluate\u001b[0;34m(self, metric_name, eval_fn, static_eval_fn_kwargs, image_experiment, **eval_fn_kwargs)\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m eval_fn_kwargs\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 343\u001b[0m curr_kwargs[k] \u001b[38;5;241m=\u001b[39m v[i]\n\u001b[0;32m--> 344\u001b[0m res\u001b[38;5;241m.\u001b[39mappend(\u001b[43meval_fn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcurr_kwargs\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 345\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_score(metric_name, res)\n", - "File \u001b[0;32m~/Development/prompttools/prompttools/utils/autoeval_from_expected.py:66\u001b[0m, in \u001b[0;36mautoeval_from_expected_response\u001b[0;34m(row, expected, prompt_column_name, response_column_name)\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mautoeval_from_expected_response\u001b[39m(\n\u001b[1;32m 64\u001b[0m row: pandas\u001b[38;5;241m.\u001b[39mcore\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries, expected: \u001b[38;5;28mstr\u001b[39m, prompt_column_name: \u001b[38;5;28mstr\u001b[39m, response_column_name: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresponse\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 65\u001b[0m ):\n\u001b[0;32m---> 66\u001b[0m prompt \u001b[38;5;241m=\u001b[39m \u001b[43mrow\u001b[49m\u001b[43m[\u001b[49m\u001b[43mprompt_column_name\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 67\u001b[0m response \u001b[38;5;241m=\u001b[39m row[response_column_name]\n\u001b[1;32m 68\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m compute(prompt, expected, response)\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/series.py:1007\u001b[0m, in \u001b[0;36mSeries.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1004\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[key]\n\u001b[1;32m 1006\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m key_is_scalar:\n\u001b[0;32m-> 1007\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1009\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_hashable(key):\n\u001b[1;32m 1010\u001b[0m \u001b[38;5;66;03m# Otherwise index.get_value will raise InvalidIndexError\u001b[39;00m\n\u001b[1;32m 1011\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1012\u001b[0m \u001b[38;5;66;03m# For labels that don't resolve as scalars like tuples and frozensets\u001b[39;00m\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/series.py:1116\u001b[0m, in \u001b[0;36mSeries._get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[label]\n\u001b[1;32m 1115\u001b[0m \u001b[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001b[39;00m\n\u001b[0;32m-> 1116\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1118\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(loc):\n\u001b[1;32m 1119\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[loc]\n", - "File \u001b[0;32m/usr/local/lib/python3.11/site-packages/pandas/core/indexes/base.py:3655\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3653\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[1;32m 3654\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m-> 3655\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3656\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3657\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3658\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3659\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3660\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", - "\u001b[0;31mKeyError\u001b[0m: 'prompt'" - ] - } - ], - "source": [ - "experiment.evaluate(\"correctness\", autoeval_from_expected.autoeval_from_expected_response, {\"prompt_column_name\": \"prompt\"}, expected=filtered_set[0]['solution'])\n", - "experiment.visualize()" ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 ] }, { "cell_type": "code", "execution_count": null, -<<<<<<< HEAD "id": "587efa63", -======= - "id": "c7aaeff9", ->>>>>>> 4be34f31172194b480846c309650b1af009ccbf1 "metadata": {}, "outputs": [], "source": []