From 1b8804b027b8b1a339028b49a5d86d391a741f0d Mon Sep 17 00:00:00 2001
From: steven krawczyk
Date: Thu, 20 Jul 2023 20:26:01 -0700
Subject: [PATCH 01/10] GPT4 vs Llama experiment

---
 examples/notebooks/GPT4vsLlama2.ipynb         | 320 ++++++++++++++
 examples/notebooks/OpenAIChatExperiment.ipynb | 101 ++++-
 examples/notebooks/Untitled.ipynb             | 400 ------------------
 .../experiment/experiments/experiment.py      |   3 +-
 .../experiments/llama_cpp_experiment.py       |  11 +-
 .../experiments/openai_chat_experiment.py     |  16 +-
 prompttools/selector/prompt_selector.py       |  36 ++
 7 files changed, 473 insertions(+), 414 deletions(-)
 create mode 100644 examples/notebooks/GPT4vsLlama2.ipynb
 delete mode 100644 examples/notebooks/Untitled.ipynb
 create mode 100644 prompttools/selector/prompt_selector.py

diff --git a/examples/notebooks/GPT4vsLlama2.ipynb b/examples/notebooks/GPT4vsLlama2.ipynb
new file mode 100644
index 00000000..471e97e3
--- /dev/null
+++ b/examples/notebooks/GPT4vsLlama2.ipynb
@@ -0,0 +1,320 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0a13ddc8",
+   "metadata": {},
+   "source": [
+    "# Open Source vs. OpenAI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "780dc3bf",
+   "metadata": {},
+   "source": [
+    "Did GPT-4 get worse? Is Llama 2 a better model? Run this notebook to find out.\n",
+    "\n",
+    "We'll use auto-evaluation by GPT-4 to measure outputs from Llama 2, as well as gpt-4 (current and frozen versions) across a few prompts. To make this example easy to run, we'll be using a 7B GGML variant of the Llama model. This should be able to run on a typical laptop."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "623f0cfe",
+   "metadata": {},
+   "source": [
+    "## Installations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "52881369",
+   "metadata": {},
+   "source": [
+    "You can set up prompttools either by installing via `pip` or using `python setup.py develop` in the root of this repo. Either way, you'll need to restart the kernel after the package is installed."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885dabeb", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install --quiet --force-reinstall prompttools" + ] + }, + { + "cell_type": "markdown", + "id": "2eac35f8", + "metadata": {}, + "source": [ + "## Setup imports and API keys" + ] + }, + { + "cell_type": "markdown", + "id": "5edba05a", + "metadata": {}, + "source": [ + "Next, we'll need to set our API keys. Since we want to use GPT-4 for auto-eval, we need to set that one." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ed4e635e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['DEBUG'] = \"\"\n", + "os.environ['OPENAI_API_KEY'] = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "842f1e47", + "metadata": {}, + "source": [ + "Then we'll import the relevant `prompttools` modules to setup our experiment." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "beaa70a1", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict, List, Tuple\n", + "from prompttools.experiment import LlamaCppExperiment\n", + "from prompttools.experiment import OpenAIChatExperiment\n", + "from prompttools.harness.multi_experiment_harness import MultiExperimentHarness\n", + "from prompttools.selector.prompt_selector import PromptSelector" + ] + }, + { + "cell_type": "markdown", + "id": "622dea9a", + "metadata": {}, + "source": [ + "## Run an experiment" + ] + }, + { + "cell_type": "markdown", + "id": "0cd0bae8", + "metadata": {}, + "source": [ + "To set up this experiment, we need to use a `PromptSelector`. This is because the input formats for Llama 2 and GPT-4 are different. While GPT-4 is run with a chat history, Llama2 takes text input. A `PromptSelector` allows us to pass the same prompt to different models, and render the necessary object at request time." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2788d49f", + "metadata": {}, + "outputs": [], + "source": [ + "instructions = [\"\"\"\n", + "You are a sales development representative for a startup called Hegel AI.\n", + "Your startup builds developer tools for large language models.\n", + "\"\"\",\n", + "\"\"\"\n", + "You are a customer support representative for a startup called Hegel AI.\n", + "Answer the following customer question:\n", + "\"\"\", \n", + "\"\"\"\n", + "You are a helpful math tutor.\n", + "Answer the following math problem:\n", + "\"\"\"]\n", + "inputs = [\"\"\"\n", + "Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\n", + "of their time to chat about how they're using large language models.\n", + "\"\"\",\n", + "\"\"\"\n", + "Do you offer refunds?\n", + "\"\"\",\n", + "\"\"\"\n", + "Is 7 a prime number?\n", + "\"\"\"]\n", + "selectors = [PromptSelector(instructions[i], inputs[i]) for i in range(3)]" + ] + }, + { + "cell_type": "markdown", + "id": "3babfe5a", + "metadata": {}, + "source": [ + "Next, we create our test inputs. We can iterate over models, inputs, and configurations like temperature." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9114cfbf", + "metadata": {}, + "outputs": [], + "source": [ + "model_paths = ['/Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", + "temperatures = [1.0]\n", + "call_params = dict(temperature=temperatures)\n", + "llama_experiment = LlamaCppExperiment(model_paths, selectors, call_params=call_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fe83830", + "metadata": {}, + "outputs": [], + "source": [ + "models = ['gpt-4-0314', 'gpt-4-0613', 'gpt-4']\n", + "temperatures = [0.0]\n", + "openai_experiment = OpenAIChatExperiment(models, selectors, temperature=temperatures)" + ] + }, + { + "cell_type": "markdown", + "id": "6c3162e6", + "metadata": {}, + "source": [ + "After that - we define our harness to run experiments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9147649a", + "metadata": {}, + "outputs": [], + "source": [ + "harness = MultiExperimentHarness([openai_experiment, llama_experiment])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f22ebd7", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "harness.prepare()\n", + "harness.run()" + ] + }, + { + "cell_type": "markdown", + "id": "2ceb662a", + "metadata": {}, + "source": [ + "Finally, we define an evaluation function that can be used to evaluate outputs across different models. 
Notice that the extract_responses helper below handles both output formats: llama-cpp completions carry a \"text\" field in each choice, while OpenAI chat completions carry a \"message\" with \"content\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ddbb951",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from prompttools.utils import autoeval\n",
+    "\n",
+    "\n",
+    "def extract_responses(output) -> str:\n",
+    "    if \"text\" in output[\"choices\"][0]:\n",
+    "        return [choice[\"text\"] for choice in output[\"choices\"]]\n",
+    "    else:\n",
+    "        return [choice[\"message\"][\"content\"] for choice in output[\"choices\"]]\n",
+    "\n",
+    "\n",
+    "def use_gpt4(\n",
+    "    prompt: str, results: Dict, metadata: Dict\n",
+    ") -> float:\n",
+    "    \"\"\"\n",
+    "    A simple test that checks semantic similarity between the user input\n",
+    "    and the model's text responses.\n",
+    "    \"\"\"\n",
+    "    distances = [\n",
+    "        autoeval.compute(prompt, response)\n",
+    "        for response in extract_responses(results)\n",
+    "    ]\n",
+    "    return min(distances)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "974d6065",
+   "metadata": {},
+   "source": [
+    "Finally, we can evaluate and visualize the results."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e80dfeec", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "harness.evaluate(\"auto-evaluation\", use_gpt4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d09c18e", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "harness.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee8dc43", + "metadata": {}, + "outputs": [], + "source": [ + "harness.visualize(\"response(s)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c90958d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/OpenAIChatExperiment.ipynb b/examples/notebooks/OpenAIChatExperiment.ipynb index 9733b17a..14dac017 100644 --- a/examples/notebooks/OpenAIChatExperiment.ipynb +++ b/examples/notebooks/OpenAIChatExperiment.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "0a13ddc8", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "623f0cfe", "metadata": {}, @@ -18,15 +20,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "885dabeb", "metadata": {}, "outputs": [], "source": [ - "!pip install --quiet --force-reinstall prompttools" + "# !pip install --quiet --force-reinstall prompttools" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2eac35f8", "metadata": {}, @@ -35,6 +38,7 @@ ] }, { + "attachments": {}, "cell_type": 
"markdown", "id": "5edba05a", "metadata": {}, @@ -44,18 +48,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "ed4e635e", "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ['DEBUG']=\"1\"\n", + "os.environ['DEBUG']=\"\"\n", "os.environ['HEGELAI_API_KEY'] = \"\" # Optional, it will be needed to use with `HegelScribe` to persist/visualize your experiments\n", "os.environ['OPENAI_API_KEY'] = \"\"" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "842f1e47", "metadata": {}, @@ -65,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "beaa70a1", "metadata": {}, "outputs": [], @@ -75,6 +80,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "622dea9a", "metadata": {}, @@ -83,6 +89,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3babfe5a", "metadata": {}, @@ -92,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "9114cfbf", "metadata": {}, "outputs": [], @@ -108,6 +115,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f3fa5450", "metadata": {}, @@ -117,15 +125,90 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "83b33130", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception in thread Thread-5 (_process_queue):\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py\", line 1038, in _bootstrap_inner\n", + " self.run()\n", + " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py\", line 975, in run\n", + " self._target(*self._args, **self._kwargs)\n", + " File \"/Users/stevenkrawczyk/Development/prompttools/prompttools/requests/request_queue.py\", line 35, in _process_queue\n", + " self._do_task(fn, 
args)\n", + " File \"/Users/stevenkrawczyk/Development/prompttools/prompttools/requests/request_queue.py\", line 42, in _do_task\n", + " res = self._run(fn, args)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 289, in wrapped_f\n", + " return self(f, *args, **kw)\n", + " ^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 379, in __call__\n", + " do = self.iter(retry_state=retry_state)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 314, in iter\n", + " return fut.result()\n", + " ^^^^^^^^^^^^\n", + " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py\", line 449, in result\n", + " return self.__get_result()\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py\", line 401, in __get_result\n", + " raise self._exception\n", + " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 382, in __call__\n", + " result = fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/stevenkrawczyk/Development/prompttools/prompttools/requests/request_queue.py\", line 55, in _run\n", + " result = fn(**args)\n", + " ^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/openai/api_resources/chat_completion.py\", line 25, in create\n", + " return super().create(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/openai/api_resources/abstract/engine_api_resource.py\", line 153, in create\n", + " response, _, api_key = requestor.request(\n", + " ^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/openai/api_requestor.py\", line 230, in request\n", + " resp, got_stream = 
self._interpret_response(result, stream)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/site-packages/openai/api_requestor.py\", line 624, in _interpret_response\n", + " self._interpret_response_line(\n", + " File \"/usr/local/lib/python3.11/site-packages/openai/api_requestor.py\", line 687, in _interpret_response_line\n", + " raise self.handle_error_response(\n", + "openai.error.InvalidRequestError: We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Who was the first president?'}], 'temperature': 0.0, 'top_p': 1.0, 'n': 1, 'stream': False, 'stop': None, 'max_token': inf, 'presence_penalty': 0, 'frequency_penalty': 0, 'logit_bias': None}\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mexperiment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Development/prompttools/prompttools/experiment/experiments/experiment.py:136\u001b[0m, in \u001b[0;36mExperiment.run\u001b[0;34m(self, runs)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(runs):\n\u001b[1;32m 132\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqueue\u001b[38;5;241m.\u001b[39menqueue(\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompletion_fn,\n\u001b[1;32m 134\u001b[0m combo,\n\u001b[1;32m 135\u001b[0m )\n\u001b[0;32m--> 136\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresults \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqueue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresults\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscores[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlatency\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqueue\u001b[38;5;241m.\u001b[39mlatencies()\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresults) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "File \u001b[0;32m~/Development/prompttools/prompttools/requests/request_queue.py:81\u001b[0m, in \u001b[0;36mRequestQueue.results\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mresults\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mobject\u001b[39m]]:\n\u001b[1;32m 78\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;124;03m Joins the queue and gets results.\u001b[39;00m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 81\u001b[0m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_results\n", + "File \u001b[0;32m/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/queue.py:90\u001b[0m, in \u001b[0;36mQueue.join\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mall_tasks_done:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munfinished_tasks:\n\u001b[0;32m---> 90\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_tasks_done\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], "source": [ "experiment.run()" ] }, { + "attachments": {}, "cell_type": "markdown", 
"id": "266c13eb", "metadata": {}, @@ -134,6 +217,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bebb8023", "metadata": {}, @@ -172,6 +256,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "974d6065", "metadata": {}, diff --git a/examples/notebooks/Untitled.ipynb b/examples/notebooks/Untitled.ipynb deleted file mode 100644 index 515b36e9..00000000 --- a/examples/notebooks/Untitled.ipynb +++ /dev/null @@ -1,400 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0a13ddc8", - "metadata": {}, - "source": [ - "# Open Source vc OpenAI" - ] - }, - { - "cell_type": "markdown", - "id": "780dc3bf", - "metadata": {}, - "source": [ - "Wondering how much better Llama 2 is compared to Llama?\n", - "\n", - "In this notebook, we'll use auto-evaluation by GPT-4 to measure outputs from both Llama and Llama 2 on a few prompts. To make this example easy to run, we'll be using 7B GGML variants of the Llama models. This should be able to run on a typical laptop." - ] - }, - { - "cell_type": "markdown", - "id": "623f0cfe", - "metadata": {}, - "source": [ - "## Installations" - ] - }, - { - "cell_type": "markdown", - "id": "52881369", - "metadata": {}, - "source": [ - "You can setup prompttools either by installing via `pip` or using `python setup.py develop` in the root of this repo. Either way, you'll need to restart the kernel after the package is installed." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "885dabeb", - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install --quiet --force-reinstall prompttools" - ] - }, - { - "cell_type": "markdown", - "id": "2eac35f8", - "metadata": {}, - "source": [ - "## Setup imports and API keys" - ] - }, - { - "cell_type": "markdown", - "id": "5edba05a", - "metadata": {}, - "source": [ - "Next, we'll need to set our API keys. Since we want to use GPT-4 for auto-eval, we need to set that one. We won't be using the Hegel AI API key for this example." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ed4e635e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ['DEBUG'] = \"1\"\n", - "os.environ['HEGELAI_API_KEY'] = \"\"\n", - "os.environ['OPENAI_API_KEY'] = \"\"" - ] - }, - { - "cell_type": "markdown", - "id": "842f1e47", - "metadata": {}, - "source": [ - "Then we'll import the relevant `prompttools` modules to setup our experiment." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "beaa70a1", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Dict, List, Tuple\n", - "from prompttools.experiment import LlamaCppExperiment\n", - "from prompttools.experiment import OpenAIChatExperiment\n", - "from prompttools.harness.multi_experiment_harness import MultiExperimentHarness" - ] - }, - { - "cell_type": "markdown", - "id": "622dea9a", - "metadata": {}, - "source": [ - "## Run an experiment" - ] - }, - { - "cell_type": "markdown", - "id": "3babfe5a", - "metadata": {}, - "source": [ - "Next, we create our test inputs. We can iterate over models, inputs, and configurations like temperature." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "9114cfbf", - "metadata": {}, - "outputs": [], - "source": [ - "model_paths = ['/Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", - "prompts = [\n", - " \"\"\"\n", - " OBJECTIVE:\n", - " You are a sales development representative for a startup called Hegel AI.\n", - " Your startup builds developer tools for large language models.\n", - " Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\n", - " of their time to chat about how they're using large language models.\n", - " \n", - " RESPONSE:\n", - " \"\"\",\n", - " \"\"\"\n", - " OBJECTIVE:\n", - " You are a customer support representative for a startup called Hegel AI.\n", - " Answer the following customer question:\n", - " Do you offer refunds?\n", - " \n", - " RESPONSE:\n", - " \"\"\"\n", - "]\n", - "temperatures = [1.0]\n", - "call_params = dict(temperature=temperatures)\n", - "llama_experiment = LlamaCppExperiment(model_paths, prompts, call_params=call_params)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "575032d6", - "metadata": {}, - "outputs": [], - "source": [ - "models = ['gpt-4-0314', 'gpt-4-0613', 'gpt-4']\n", - "messages = [[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": \"Who was the first president?\"},\n", - "]]\n", - "temperatures = [0.0]\n", - "\n", - "openai_experiment = OpenAIChatExperiment(models, messages, temperature=temperatures)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2ef4f996", - "metadata": {}, - "outputs": [], - "source": [ - "harness = MultiExperimentHarness([openai_experiment, llama_experiment])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0f22ebd7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"llama.cpp: loading model from /Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin\n", - "llama_model_load_internal: format = ggjt v3 (latest)\n", - "llama_model_load_internal: n_vocab = 32000\n", - "llama_model_load_internal: n_ctx = 512\n", - "llama_model_load_internal: n_embd = 4096\n", - "llama_model_load_internal: n_mult = 256\n", - "llama_model_load_internal: n_head = 32\n", - "llama_model_load_internal: n_layer = 32\n", - "llama_model_load_internal: n_rot = 128\n", - "llama_model_load_internal: ftype = 10 (mostly Q2_K)\n", - "llama_model_load_internal: n_ff = 11008\n", - "llama_model_load_internal: model size = 7B\n", - "llama_model_load_internal: ggml ctx size = 0.08 MB\n", - "llama_model_load_internal: mem required = 4525.65 MB (+ 1026.00 MB per state)\n", - "llama_new_context_with_model: kv self size = 256.00 MB\n", - "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | \n", - "\n", - "llama_print_timings: load time = 11941.03 ms\n", - "llama_print_timings: sample time = 106.52 ms / 128 runs ( 0.83 ms per token, 1201.62 tokens per second)\n", - "llama_print_timings: prompt eval time = 11940.95 ms / 92 tokens ( 129.79 ms per token, 7.70 tokens per second)\n", - "llama_print_timings: eval time = 16828.11 ms / 127 runs ( 132.50 ms per token, 7.55 tokens per second)\n", - "llama_print_timings: total time = 29220.65 ms\n", - "Llama.generate: prefix-match hit\n", - "\n", - "llama_print_timings: load time = 11941.03 ms\n", - "llama_print_timings: sample time = 49.83 ms / 59 runs ( 0.84 ms per token, 1184.12 tokens per second)\n", - "llama_print_timings: prompt eval time = 9876.59 ms / 42 tokens ( 235.16 ms per token, 4.25 tokens per second)\n", - "llama_print_timings: eval time = 8615.17 ms / 58 runs ( 148.54 ms per token, 6.73 tokens per second)\n", - "llama_print_timings: total time = 18704.92 ms\n" - ] - } - ], - "source": 
[ - "harness.prepare()\n", - "harness.run()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8ddbb951", - "metadata": {}, - "outputs": [], - "source": [ - "from prompttools.utils import autoeval\n", - "\n", - "\n", - "def extract_responses(output) -> str:\n", - " return [choice[\"text\"] for choice in output[\"choices\"]]\n", - "\n", - "\n", - "def use_gpt4(\n", - " prompt: str, results: Dict, metadata: Dict\n", - ") -> float:\n", - " \"\"\"\n", - " A simple test that checks semantic similarity between the user input\n", - " and the model's text responses.\n", - " \"\"\"\n", - " return 0.0\n", - "# distances = [\n", - "# autoeval.compute(prompt, response)\n", - "# for response in extract_responses(results)\n", - "# ]\n", - "# return min(distances)\n" - ] - }, - { - "cell_type": "markdown", - "id": "974d6065", - "metadata": {}, - "source": [ - "Finally, we can evaluate and visualize the results." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e80dfeec", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "harness.evaluate(\"auto-evaluation\", use_gpt4)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4d09c18e", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "defaultdict(, {'latency': [5.138979759067297e-06, 3.362016286700964e-06, 2.558052074164152e-06, 29.22151685395511, 18.70563395199133], 'auto-evaluation': [0.0, 0.0, 0.0, 0.0, 0.0]})\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modelgpt-4gpt-4-0314gpt-4-0613llama-2-7b-chat.ggmlv3.q2_K.bin
prompt
\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\nNaNNaNNaN[ Yes, at Hegel AI we understand that sometimes our customers may need to return or cancel their orders. We do indeed offer refunds on certain products. The policy for refunds is available on our website under the \"Returns and Refunds\" section. ]
\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\nNaNNaNNaN[\\n Subject: Quick Chat on Large Language Models?\\n \\n Hi [Prospect Name],\\n Hope this finds you well! I'm [Your Name], a sales development \\n representative from Hegel AI, an innovative startup developing \\n developer tools for large language models. \\n We have developed cutting-edge technologies to aid in the \\n improvement and optimization of these models.\\n \\n Our products can enhance the efficiency and quality of your work.\\n I would love to arrange a brief discussion with you to discuss how ]
Who was the first president?[George Washington][George Washington][George Washington]NaN
\n", - "
" - ], - "text/plain": [ - "model gpt-4 \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n NaN \\\n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n NaN \n", - "Who was the first president? [George Washington] \n", - "\n", - "model gpt-4-0314 \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n NaN \\\n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n NaN \n", - "Who was the first president? [George Washington] \n", - "\n", - "model gpt-4-0613 \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n NaN \\\n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n NaN \n", - "Who was the first president? 
[George Washington] \n", - "\n", - "model llama-2-7b-chat.ggmlv3.q2_K.bin \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n [ Yes, at Hegel AI we understand that sometimes our customers may need to return or cancel their orders. We do indeed offer refunds on certain products. The policy for refunds is available on our website under the \"Returns and Refunds\" section. ] \n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n [\\n Subject: Quick Chat on Large Language Models?\\n \\n Hi [Prospect Name],\\n Hope this finds you well! I'm [Your Name], a sales development \\n representative from Hegel AI, an innovative startup developing \\n developer tools for large language models. \\n We have developed cutting-edge technologies to aid in the \\n improvement and optimization of these models.\\n \\n Our products can enhance the efficiency and quality of your work.\\n I would love to arrange a brief discussion with you to discuss how ] \n", - "Who was the first president? 
NaN " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "harness.visualize(\"response(s)\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/prompttools/experiment/experiments/experiment.py b/prompttools/experiment/experiments/experiment.py index 563b081f..bcca147c 100644 --- a/prompttools/experiment/experiments/experiment.py +++ b/prompttools/experiment/experiments/experiment.py @@ -131,7 +131,8 @@ def run( for _ in range(runs): self.queue.enqueue( self.completion_fn, - combo, + # We need to filter out defaults that are invalid JSON from the request + {k: v for k, v in combo.items() if v and v != float("inf")}, ) self.results = self.queue.results() self.scores["latency"] = self.queue.latencies() diff --git a/prompttools/experiment/experiments/llama_cpp_experiment.py b/prompttools/experiment/experiments/llama_cpp_experiment.py index c053e2e6..9df70ae7 100644 --- a/prompttools/experiment/experiments/llama_cpp_experiment.py +++ b/prompttools/experiment/experiments/llama_cpp_experiment.py @@ -14,6 +14,7 @@ from .experiment import Experiment from .error import PromptExperimentException +from prompttools.selector.prompt_selector import PromptSelector class LlamaCppExperiment(Experiment): @@ -81,7 +82,7 @@ class LlamaCppExperiment(Experiment): def __init__( self, model_path: List[str], - prompt: List[str], + prompt: List[str] | List[PromptSelector], model_params: Dict[str, object] = {}, call_params: Dict[str, object] = {}, ): @@ -89,7 +90,13 @@ def __init__( self.model_params = model_params self.call_params = 
call_params self.model_params["model_path"] = model_path - self.call_params["prompt"] = prompt + + # If we are using a prompt selector, we need to + # render the prompts from the selector + if isinstance(prompt[0], PromptSelector): + self.call_params["prompt"] = [selector.for_llama() for selector in prompt] + else: + self.call_params["prompt"] = prompt # Set defaults for param in self.MODEL_PARAMETERS: diff --git a/prompttools/experiment/experiments/openai_chat_experiment.py b/prompttools/experiment/experiments/openai_chat_experiment.py index ca8231e5..49d76bf1 100644 --- a/prompttools/experiment/experiments/openai_chat_experiment.py +++ b/prompttools/experiment/experiments/openai_chat_experiment.py @@ -8,7 +8,7 @@ from typing import Dict, List, Optional import openai - +from prompttools.selector.prompt_selector import PromptSelector from prompttools.mock.mock import mock_chat_completion_fn from .experiment import Experiment @@ -23,7 +23,7 @@ class OpenAIChatExperiment(Experiment): def __init__( self, model: List[str], - messages: List[List[Dict[str, str]]], + messages: List[List[Dict[str, str]]] | List[PromptSelector], temperature: Optional[List[float]] = [1.0], top_p: Optional[List[float]] = [1.0], n: Optional[List[int]] = [1], @@ -37,6 +37,16 @@ def __init__( self.completion_fn = openai.ChatCompletion.create if os.getenv("DEBUG", default=False): self.completion_fn = mock_chat_completion_fn + + # If we are using a prompt selector, we need to render + # messages, as well as create prompt_keys to map the messages + # to corresponding prompts in other models. 
+ if isinstance(messages[0], PromptSelector): + self.prompt_keys = {str(selector.for_openai_chat()[-1]["content"]): selector.for_llama() for selector in messages} + messages = [selector.for_openai_chat() for selector in messages] + else: + self.prompt_keys = messages + self.all_args = dict( model=model, messages=messages, @@ -63,4 +73,4 @@ def _get_model_names(self): return [combo['model'] for combo in self.argument_combos] def _get_prompts(self): - return [combo['messages'][-1]["content"] for combo in self.argument_combos] \ No newline at end of file + return [self.prompt_keys[str(combo['messages'][-1]["content"])] for combo in self.argument_combos] \ No newline at end of file diff --git a/prompttools/selector/prompt_selector.py b/prompttools/selector/prompt_selector.py new file mode 100644 index 00000000..995c154d --- /dev/null +++ b/prompttools/selector/prompt_selector.py @@ -0,0 +1,36 @@ +# Copyright (c) Hegel AI, Inc. +# All rights reserved. +# +# This source code's license can be found in the +# LICENSE file in the root directory of this source tree. + +TEMPLATE = """ +### INSTRUCTION +{instruction} + +### INPUT +{user_input} + +### OUTPUT +""" + +class PromptSelector: + r""" + An abstraction for rendering the same prompt + for different models, e.g. 
OpenAI Chat models + and Llama models + """ + def __init__(self, instruction: str, user_input: str): + self.instruction = instruction + self.user_input = user_input + + def for_openai_chat(self): + return [ + {"role": "system", "content": self.instruction}, + {"role": "user", "content": self.user_input}, + ] + + def for_llama(self): + return TEMPLATE.format(instruction=self.instruction, + user_input=self.user_input) + \ No newline at end of file From 8df640f1fb6e11c655827e5bcc35da5970818a59 Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Thu, 20 Jul 2023 20:27:53 -0700 Subject: [PATCH 02/10] GPT4 vs Llama experiment --- examples/notebooks/GPT4vsLlama2.ipynb | 320 ++++++++++++++ examples/notebooks/OpenAIChatExperiment.ipynb | 27 +- examples/notebooks/Untitled.ipynb | 400 ------------------ .../experiment/experiments/experiment.py | 3 +- .../experiments/llama_cpp_experiment.py | 11 +- .../experiments/openai_chat_experiment.py | 16 +- prompttools/selector/prompt_selector.py | 36 ++ 7 files changed, 395 insertions(+), 418 deletions(-) create mode 100644 examples/notebooks/GPT4vsLlama2.ipynb delete mode 100644 examples/notebooks/Untitled.ipynb create mode 100644 prompttools/selector/prompt_selector.py diff --git a/examples/notebooks/GPT4vsLlama2.ipynb b/examples/notebooks/GPT4vsLlama2.ipynb new file mode 100644 index 00000000..471e97e3 --- /dev/null +++ b/examples/notebooks/GPT4vsLlama2.ipynb @@ -0,0 +1,320 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0a13ddc8", + "metadata": {}, + "source": [ + "# Open Source vc OpenAI" + ] + }, + { + "cell_type": "markdown", + "id": "780dc3bf", + "metadata": {}, + "source": [ + "Did GPT-4 get worse? Is Llama 2 a better model? Run this notebook to find out.\n", + "\n", + "We'll use auto-evaluation by GPT-4 to measure outputs from Llama 2, as well as gpt-4 (current and frozen versions) across a few prompts. To make this example easy to run, we'll be using a 7B GGML variant of the Llama model. 
This should be able to run on a typical laptop." + ] }, + { + "cell_type": "markdown", + "id": "623f0cfe", + "metadata": {}, + "source": [ + "## Installations" + ] + }, + { + "cell_type": "markdown", + "id": "52881369", + "metadata": {}, + "source": [ + "You can set up prompttools either by installing via `pip` or using `python setup.py develop` in the root of this repo. Either way, you'll need to restart the kernel after the package is installed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885dabeb", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install --quiet --force-reinstall prompttools" + ] + }, + { + "cell_type": "markdown", + "id": "2eac35f8", + "metadata": {}, + "source": [ + "## Setup imports and API keys" + ] + }, + { + "cell_type": "markdown", + "id": "5edba05a", + "metadata": {}, + "source": [ + "Next, we'll need to set our API keys. Since we want to use GPT-4 for auto-eval, we need to set that one." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ed4e635e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['DEBUG'] = \"\"\n", + "os.environ['OPENAI_API_KEY'] = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "842f1e47", + "metadata": {}, + "source": [ + "Then we'll import the relevant `prompttools` modules to set up our experiment."
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "beaa70a1", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict, List, Tuple\n", + "from prompttools.experiment import LlamaCppExperiment\n", + "from prompttools.experiment import OpenAIChatExperiment\n", + "from prompttools.harness.multi_experiment_harness import MultiExperimentHarness\n", + "from prompttools.selector.prompt_selector import PromptSelector" + ] + }, + { + "cell_type": "markdown", + "id": "622dea9a", + "metadata": {}, + "source": [ + "## Run an experiment" + ] + }, + { + "cell_type": "markdown", + "id": "0cd0bae8", + "metadata": {}, + "source": [ + "To set up this experiment, we need to use a `PromptSelector`. This is because the input formats for Llama 2 and GPT-4 are different. While GPT-4 is run with a chat history, Llama2 takes text input. A `PromptSelector` allows us to pass the same prompt to different models, and render the necessary object at request time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2788d49f", + "metadata": {}, + "outputs": [], + "source": [ + "instructions = [\"\"\"\n", + "You are a sales development representative for a startup called Hegel AI.\n", + "Your startup builds developer tools for large language models.\n", + "\"\"\",\n", + "\"\"\"\n", + "You are a customer support representative for a startup called Hegel AI.\n", + "Answer the following customer question:\n", + "\"\"\", \n", + "\"\"\"\n", + "You are a helpful math tutor.\n", + "Answer the following math problem:\n", + "\"\"\"]\n", + "inputs = [\"\"\"\n", + "Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\n", + "of their time to chat about how they're using large language models.\n", + "\"\"\",\n", + "\"\"\"\n", + "Do you offer refunds?\n", + "\"\"\",\n", + "\"\"\"\n", + "Is 7 a prime number?\n", + "\"\"\"]\n", + "selectors = [PromptSelector(instructions[i], inputs[i]) for i in range(3)]" + ] + }, + { + 
"cell_type": "markdown", + "id": "3babfe5a", + "metadata": {}, + "source": [ + "Next, we create our test inputs. We can iterate over models, inputs, and configurations like temperature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9114cfbf", + "metadata": {}, + "outputs": [], + "source": [ + "model_paths = ['/Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", + "temperatures = [1.0]\n", + "call_params = dict(temperature=temperatures)\n", + "llama_experiment = LlamaCppExperiment(model_paths, selectors, call_params=call_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fe83830", + "metadata": {}, + "outputs": [], + "source": [ + "models = ['gpt-4-0314', 'gpt-4-0613', 'gpt-4']\n", + "temperatures = [0.0]\n", + "openai_experiment = OpenAIChatExperiment(models, selectors, temperature=temperatures)" + ] + }, + { + "cell_type": "markdown", + "id": "6c3162e6", + "metadata": {}, + "source": [ + "After that - we define our harness to run experiments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9147649a", + "metadata": {}, + "outputs": [], + "source": [ + "harness = MultiExperimentHarness([openai_experiment, llama_experiment])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f22ebd7", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "harness.prepare()\n", + "harness.run()" + ] + }, + { + "cell_type": "markdown", + "id": "2ceb662a", + "metadata": {}, + "source": [ + "Finally, we define an evaluation function that can be used to evaluate outputs across different models. 
Notice that the `extract_responses` helper supports both model APIs: Llama completions carry a \"text\" field, while OpenAI chat completions nest the content under \"message\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ddbb951", + "metadata": {}, + "outputs": [], + "source": [ + "from prompttools.utils import autoeval\n", + "\n", + "\n", + "def extract_responses(output) -> str:\n", + "    if \"text\" in output[\"choices\"][0]:\n", + "        return [choice[\"text\"] for choice in output[\"choices\"]]\n", + "    else:\n", + "        return [choice[\"message\"][\"content\"] for choice in output[\"choices\"]]\n", + "\n", + "\n", + "def use_gpt4(\n", + "    prompt: str, results: Dict, metadata: Dict\n", + ") -> float:\n", + "    \"\"\"\n", + "    A simple test that checks semantic similarity between the user input\n", + "    and the model's text responses.\n", + "    \"\"\"\n", + "    distances = [\n", + "        autoeval.compute(prompt, response)\n", + "        for response in extract_responses(results)\n", + "    ]\n", + "    return min(distances)\n" + ] + }, + { + "cell_type": "markdown", + "id": "974d6065", + "metadata": {}, + "source": [ + "Finally, we can evaluate and visualize the results."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e80dfeec", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "harness.evaluate(\"auto-evaluation\", use_gpt4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d09c18e", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "harness.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee8dc43", + "metadata": {}, + "outputs": [], + "source": [ + "harness.visualize(\"response(s)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c90958d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/notebooks/OpenAIChatExperiment.ipynb b/examples/notebooks/OpenAIChatExperiment.ipynb index ae0dd4f8..e6a575e2 100644 --- a/examples/notebooks/OpenAIChatExperiment.ipynb +++ b/examples/notebooks/OpenAIChatExperiment.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "0a13ddc8", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "623f0cfe", "metadata": {}, @@ -23,10 +25,11 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --quiet --force-reinstall prompttools" + "# !pip install --quiet --force-reinstall prompttools" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2eac35f8", "metadata": {}, @@ -35,11 +38,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5edba05a", "metadata": {}, "source": [ - "First, we'll need to set our API 
keys. If we are in DEBUG mode, we don't need to use a real OpenAI key, so for now we'll set them to empty strings." + "First, we'll need to set our API keys. If we are in DEBUG mode, we don't need to use real OpenAI or Hegel AI API keys, so for now we'll set them to empty strings." ] }, { @@ -50,11 +54,12 @@ "outputs": [], "source": [ "import os\n", - "os.environ['DEBUG']=\"1\"\n", + "os.environ['DEBUG']=\"\"\n", "os.environ['OPENAI_API_KEY'] = \"\"" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "842f1e47", "metadata": {}, @@ -74,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "622dea9a", "metadata": {}, @@ -82,6 +88,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3babfe5a", "metadata": {}, @@ -107,6 +114,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f3fa5450", "metadata": {}, @@ -125,6 +133,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "266c13eb", "metadata": {}, @@ -133,6 +142,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bebb8023", "metadata": {}, @@ -171,6 +181,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "974d6065", "metadata": {}, @@ -187,7 +198,7 @@ }, "outputs": [], "source": [ - "experiment.evaluate(\"similar_to_expected\", measure_similarity)\n" + "experiment.evaluate(\"similar_to_expected\", measure_similarity)" ] }, { @@ -201,14 +212,6 @@ "source": [ "experiment.visualize()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d0007a1f", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/examples/notebooks/Untitled.ipynb b/examples/notebooks/Untitled.ipynb deleted file mode 100644 index 515b36e9..00000000 --- a/examples/notebooks/Untitled.ipynb +++ /dev/null @@ -1,400 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0a13ddc8", - "metadata": {}, - "source": [ - "# Open Source vc OpenAI" - ] - }, - { - "cell_type": "markdown", - "id": "780dc3bf", - "metadata": {}, - 
"source": [ - "Wondering how much better Llama 2 is compared to Llama?\n", - "\n", - "In this notebook, we'll use auto-evaluation by GPT-4 to measure outputs from both Llama and Llama 2 on a few prompts. To make this example easy to run, we'll be using 7B GGML variants of the Llama models. This should be able to run on a typical laptop." - ] - }, - { - "cell_type": "markdown", - "id": "623f0cfe", - "metadata": {}, - "source": [ - "## Installations" - ] - }, - { - "cell_type": "markdown", - "id": "52881369", - "metadata": {}, - "source": [ - "You can setup prompttools either by installing via `pip` or using `python setup.py develop` in the root of this repo. Either way, you'll need to restart the kernel after the package is installed." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "885dabeb", - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install --quiet --force-reinstall prompttools" - ] - }, - { - "cell_type": "markdown", - "id": "2eac35f8", - "metadata": {}, - "source": [ - "## Setup imports and API keys" - ] - }, - { - "cell_type": "markdown", - "id": "5edba05a", - "metadata": {}, - "source": [ - "Next, we'll need to set our API keys. Since we want to use GPT-4 for auto-eval, we need to set that one. We won't be using the Hegel AI API key for this example." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ed4e635e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ['DEBUG'] = \"1\"\n", - "os.environ['HEGELAI_API_KEY'] = \"\"\n", - "os.environ['OPENAI_API_KEY'] = \"\"" - ] - }, - { - "cell_type": "markdown", - "id": "842f1e47", - "metadata": {}, - "source": [ - "Then we'll import the relevant `prompttools` modules to setup our experiment." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "beaa70a1", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Dict, List, Tuple\n", - "from prompttools.experiment import LlamaCppExperiment\n", - "from prompttools.experiment import OpenAIChatExperiment\n", - "from prompttools.harness.multi_experiment_harness import MultiExperimentHarness" - ] - }, - { - "cell_type": "markdown", - "id": "622dea9a", - "metadata": {}, - "source": [ - "## Run an experiment" - ] - }, - { - "cell_type": "markdown", - "id": "3babfe5a", - "metadata": {}, - "source": [ - "Next, we create our test inputs. We can iterate over models, inputs, and configurations like temperature." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "9114cfbf", - "metadata": {}, - "outputs": [], - "source": [ - "model_paths = ['/Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", - "prompts = [\n", - " \"\"\"\n", - " OBJECTIVE:\n", - " You are a sales development representative for a startup called Hegel AI.\n", - " Your startup builds developer tools for large language models.\n", - " Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\n", - " of their time to chat about how they're using large language models.\n", - " \n", - " RESPONSE:\n", - " \"\"\",\n", - " \"\"\"\n", - " OBJECTIVE:\n", - " You are a customer support representative for a startup called Hegel AI.\n", - " Answer the following customer question:\n", - " Do you offer refunds?\n", - " \n", - " RESPONSE:\n", - " \"\"\"\n", - "]\n", - "temperatures = [1.0]\n", - "call_params = dict(temperature=temperatures)\n", - "llama_experiment = LlamaCppExperiment(model_paths, prompts, call_params=call_params)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "575032d6", - "metadata": {}, - "outputs": [], - "source": [ - "models = ['gpt-4-0314', 'gpt-4-0613', 
'gpt-4']\n", - "messages = [[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": \"Who was the first president?\"},\n", - "]]\n", - "temperatures = [0.0]\n", - "\n", - "openai_experiment = OpenAIChatExperiment(models, messages, temperature=temperatures)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2ef4f996", - "metadata": {}, - "outputs": [], - "source": [ - "harness = MultiExperimentHarness([openai_experiment, llama_experiment])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0f22ebd7", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama.cpp: loading model from /Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin\n", - "llama_model_load_internal: format = ggjt v3 (latest)\n", - "llama_model_load_internal: n_vocab = 32000\n", - "llama_model_load_internal: n_ctx = 512\n", - "llama_model_load_internal: n_embd = 4096\n", - "llama_model_load_internal: n_mult = 256\n", - "llama_model_load_internal: n_head = 32\n", - "llama_model_load_internal: n_layer = 32\n", - "llama_model_load_internal: n_rot = 128\n", - "llama_model_load_internal: ftype = 10 (mostly Q2_K)\n", - "llama_model_load_internal: n_ff = 11008\n", - "llama_model_load_internal: model size = 7B\n", - "llama_model_load_internal: ggml ctx size = 0.08 MB\n", - "llama_model_load_internal: mem required = 4525.65 MB (+ 1026.00 MB per state)\n", - "llama_new_context_with_model: kv self size = 256.00 MB\n", - "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | \n", - "\n", - "llama_print_timings: load time = 11941.03 ms\n", - "llama_print_timings: sample time = 106.52 ms / 128 runs ( 0.83 ms per token, 1201.62 tokens per second)\n", - "llama_print_timings: prompt eval time = 11940.95 ms / 92 tokens ( 129.79 ms per 
token, 7.70 tokens per second)\n", - "llama_print_timings: eval time = 16828.11 ms / 127 runs ( 132.50 ms per token, 7.55 tokens per second)\n", - "llama_print_timings: total time = 29220.65 ms\n", - "Llama.generate: prefix-match hit\n", - "\n", - "llama_print_timings: load time = 11941.03 ms\n", - "llama_print_timings: sample time = 49.83 ms / 59 runs ( 0.84 ms per token, 1184.12 tokens per second)\n", - "llama_print_timings: prompt eval time = 9876.59 ms / 42 tokens ( 235.16 ms per token, 4.25 tokens per second)\n", - "llama_print_timings: eval time = 8615.17 ms / 58 runs ( 148.54 ms per token, 6.73 tokens per second)\n", - "llama_print_timings: total time = 18704.92 ms\n" - ] - } - ], - "source": [ - "harness.prepare()\n", - "harness.run()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8ddbb951", - "metadata": {}, - "outputs": [], - "source": [ - "from prompttools.utils import autoeval\n", - "\n", - "\n", - "def extract_responses(output) -> str:\n", - " return [choice[\"text\"] for choice in output[\"choices\"]]\n", - "\n", - "\n", - "def use_gpt4(\n", - " prompt: str, results: Dict, metadata: Dict\n", - ") -> float:\n", - " \"\"\"\n", - " A simple test that checks semantic similarity between the user input\n", - " and the model's text responses.\n", - " \"\"\"\n", - " return 0.0\n", - "# distances = [\n", - "# autoeval.compute(prompt, response)\n", - "# for response in extract_responses(results)\n", - "# ]\n", - "# return min(distances)\n" - ] - }, - { - "cell_type": "markdown", - "id": "974d6065", - "metadata": {}, - "source": [ - "Finally, we can evaluate and visualize the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e80dfeec", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "harness.evaluate(\"auto-evaluation\", use_gpt4)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4d09c18e", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "defaultdict(, {'latency': [5.138979759067297e-06, 3.362016286700964e-06, 2.558052074164152e-06, 29.22151685395511, 18.70563395199133], 'auto-evaluation': [0.0, 0.0, 0.0, 0.0, 0.0]})\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modelgpt-4gpt-4-0314gpt-4-0613llama-2-7b-chat.ggmlv3.q2_K.bin
prompt
\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\nNaNNaNNaN[ Yes, at Hegel AI we understand that sometimes our customers may need to return or cancel their orders. We do indeed offer refunds on certain products. The policy for refunds is available on our website under the \"Returns and Refunds\" section. ]
\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\nNaNNaNNaN[\\n Subject: Quick Chat on Large Language Models?\\n \\n Hi [Prospect Name],\\n Hope this finds you well! I'm [Your Name], a sales development \\n representative from Hegel AI, an innovative startup developing \\n developer tools for large language models. \\n We have developed cutting-edge technologies to aid in the \\n improvement and optimization of these models.\\n \\n Our products can enhance the efficiency and quality of your work.\\n I would love to arrange a brief discussion with you to discuss how ]
Who was the first president?[George Washington][George Washington][George Washington]NaN
\n", - "
" - ], - "text/plain": [ - "model gpt-4 \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n NaN \\\n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n NaN \n", - "Who was the first president? [George Washington] \n", - "\n", - "model gpt-4-0314 \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n NaN \\\n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n NaN \n", - "Who was the first president? [George Washington] \n", - "\n", - "model gpt-4-0613 \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n NaN \\\n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n NaN \n", - "Who was the first president? 
[George Washington] \n", - "\n", - "model llama-2-7b-chat.ggmlv3.q2_K.bin \n", - "prompt \n", - "\\n OBJECTIVE:\\n You are a customer support representative for a startup called Hegel AI.\\n Answer the following customer question:\\n Do you offer refunds?\\n \\n RESPONSE:\\n [ Yes, at Hegel AI we understand that sometimes our customers may need to return or cancel their orders. We do indeed offer refunds on certain products. The policy for refunds is available on our website under the \"Returns and Refunds\" section. ] \n", - "\\n OBJECTIVE:\\n You are a sales development representative for a startup called Hegel AI.\\n Your startup builds developer tools for large language models.\\n Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\\n of their time to chat about how they're using large language models.\\n \\n RESPONSE:\\n [\\n Subject: Quick Chat on Large Language Models?\\n \\n Hi [Prospect Name],\\n Hope this finds you well! I'm [Your Name], a sales development \\n representative from Hegel AI, an innovative startup developing \\n developer tools for large language models. \\n We have developed cutting-edge technologies to aid in the \\n improvement and optimization of these models.\\n \\n Our products can enhance the efficiency and quality of your work.\\n I would love to arrange a brief discussion with you to discuss how ] \n", - "Who was the first president? 
NaN " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "harness.visualize(\"response(s)\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/prompttools/experiment/experiments/experiment.py b/prompttools/experiment/experiments/experiment.py index 34c714ee..a68c5682 100644 --- a/prompttools/experiment/experiments/experiment.py +++ b/prompttools/experiment/experiments/experiment.py @@ -124,7 +124,8 @@ def run( for _ in range(runs): self.queue.enqueue( self.completion_fn, - combo, + # We need to filter out defaults that are invalid JSON from the request + {k: v for k, v in combo.items() if v and v != float("inf")}, ) self.results = self.queue.results() self.scores["latency"] = self.queue.latencies() diff --git a/prompttools/experiment/experiments/llama_cpp_experiment.py b/prompttools/experiment/experiments/llama_cpp_experiment.py index c053e2e6..9df70ae7 100644 --- a/prompttools/experiment/experiments/llama_cpp_experiment.py +++ b/prompttools/experiment/experiments/llama_cpp_experiment.py @@ -14,6 +14,7 @@ from .experiment import Experiment from .error import PromptExperimentException +from prompttools.selector.prompt_selector import PromptSelector class LlamaCppExperiment(Experiment): @@ -81,7 +82,7 @@ class LlamaCppExperiment(Experiment): def __init__( self, model_path: List[str], - prompt: List[str], + prompt: List[str] | List[PromptSelector], model_params: Dict[str, object] = {}, call_params: Dict[str, object] = {}, ): @@ -89,7 +90,13 @@ def __init__( self.model_params = model_params self.call_params = 
call_params self.model_params["model_path"] = model_path - self.call_params["prompt"] = prompt + + # If we are using a prompt selector, we need to + # render the prompts from the selector + if isinstance(prompt[0], PromptSelector): + self.call_params["prompt"] = [selector.for_llama() for selector in prompt] + else: + self.call_params["prompt"] = prompt # Set defaults for param in self.MODEL_PARAMETERS: diff --git a/prompttools/experiment/experiments/openai_chat_experiment.py b/prompttools/experiment/experiments/openai_chat_experiment.py index ca8231e5..49d76bf1 100644 --- a/prompttools/experiment/experiments/openai_chat_experiment.py +++ b/prompttools/experiment/experiments/openai_chat_experiment.py @@ -8,7 +8,7 @@ from typing import Dict, List, Optional import openai - +from prompttools.selector.prompt_selector import PromptSelector from prompttools.mock.mock import mock_chat_completion_fn from .experiment import Experiment @@ -23,7 +23,7 @@ class OpenAIChatExperiment(Experiment): def __init__( self, model: List[str], - messages: List[List[Dict[str, str]]], + messages: List[List[Dict[str, str]]] | List[PromptSelector], temperature: Optional[List[float]] = [1.0], top_p: Optional[List[float]] = [1.0], n: Optional[List[int]] = [1], @@ -37,6 +37,16 @@ def __init__( self.completion_fn = openai.ChatCompletion.create if os.getenv("DEBUG", default=False): self.completion_fn = mock_chat_completion_fn + + # If we are using a prompt selector, we need to render + # messages, as well as create prompt_keys to map the messages + # to corresponding prompts in other models. 
+ if isinstance(messages[0], PromptSelector): + self.prompt_keys = {str(selector.for_openai_chat()[-1]["content"]): selector.for_llama() for selector in messages} + messages = [selector.for_openai_chat() for selector in messages] + else: + self.prompt_keys = messages + self.all_args = dict( model=model, messages=messages, @@ -63,4 +73,4 @@ def _get_model_names(self): return [combo['model'] for combo in self.argument_combos] def _get_prompts(self): - return [combo['messages'][-1]["content"] for combo in self.argument_combos] \ No newline at end of file + return [self.prompt_keys[str(combo['messages'][-1]["content"])] for combo in self.argument_combos] \ No newline at end of file diff --git a/prompttools/selector/prompt_selector.py b/prompttools/selector/prompt_selector.py new file mode 100644 index 00000000..995c154d --- /dev/null +++ b/prompttools/selector/prompt_selector.py @@ -0,0 +1,36 @@ +# Copyright (c) Hegel AI, Inc. +# All rights reserved. +# +# This source code's license can be found in the +# LICENSE file in the root directory of this source tree. + +TEMPLATE = """ +### INSTRUCTION +{instruction} + +### INPUT +{user_input} + +### OUTPUT +""" + +class PromptSelector: + r""" + An abstraction for rendering the same prompt + for different models, e.g. 
OpenAI Chat models + and Llama models + """ + def __init__(self, instruction: str, user_input: str): + self.instruction = instruction + self.user_input = user_input + + def for_openai_chat(self): + return [ + {"role": "system", "content": self.instruction}, + {"role": "user", "content": self.user_input}, + ] + + def for_llama(self): + return TEMPLATE.format(instruction=self.instruction, + user_input=self.user_input) + \ No newline at end of file From 3bcbab54439fcddaca4fcd424b7e91f4382e1516 Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Thu, 20 Jul 2023 20:29:35 -0700 Subject: [PATCH 03/10] Remove nb --- examples/notebooks/OpenAIChatExperiment.ipynb | 316 ------------------ 1 file changed, 316 deletions(-) delete mode 100644 examples/notebooks/OpenAIChatExperiment.ipynb diff --git a/examples/notebooks/OpenAIChatExperiment.ipynb b/examples/notebooks/OpenAIChatExperiment.ipynb deleted file mode 100644 index 48cf1a64..00000000 --- a/examples/notebooks/OpenAIChatExperiment.ipynb +++ /dev/null @@ -1,316 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "0a13ddc8", - "metadata": {}, - "source": [ - "# OpenAI Chat Experiment Example" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "623f0cfe", - "metadata": {}, - "source": [ - "## Installations" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "885dabeb", - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install --quiet --force-reinstall prompttools" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2eac35f8", - "metadata": {}, - "source": [ - "## Setup imports and API keys" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5edba05a", - "metadata": {}, - "source": [ - "First, we'll need to set our API keys. If we are in DEBUG mode, we don't need to use real OpenAI or Hegel AI API keys, so for now we'll set them to empty strings." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ed4e635e", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "os.environ['DEBUG']=\"\"\n", -<<<<<<< HEAD -======= - "os.environ['HEGELAI_API_KEY'] = \"\" # Optional, it will be needed to use with `HegelScribe` to persist/visualize your experiments\n", ->>>>>>> 1b8804b027b8b1a339028b49a5d86d391a741f0d - "os.environ['OPENAI_API_KEY'] = \"\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "842f1e47", - "metadata": {}, - "source": [ - "Then we'll import the relevant `prompttools` modules to setup our experiment." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "beaa70a1", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Dict, List, Tuple\n", - "from prompttools.experiment import OpenAIChatExperiment" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "622dea9a", - "metadata": {}, - "source": [ - "## Run an experiment" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3babfe5a", - "metadata": {}, - "source": [ - "Next, we create our test inputs. We can iterate over models, inputs, and configurations like temperature." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "9114cfbf", - "metadata": {}, - "outputs": [], - "source": [ - "models = ['gpt-3.5-turbo', 'gpt-3.5-turbo-0613']\n", - "messages = [[\n", - " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", - " {\"role\": \"user\", \"content\": \"Who was the first president?\"},\n", - "]]\n", - "temperatures = [0.0, 1.0]\n", - "\n", - "experiment = OpenAIChatExperiment(models, messages, temperature=temperatures)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "f3fa5450", - "metadata": {}, - "source": [ - "We can then run the experiment to get results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "83b33130", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Exception in thread Thread-5 (_process_queue):\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py\", line 1038, in _bootstrap_inner\n", - " self.run()\n", - " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py\", line 975, in run\n", - " self._target(*self._args, **self._kwargs)\n", - " File \"/Users/stevenkrawczyk/Development/prompttools/prompttools/requests/request_queue.py\", line 35, in _process_queue\n", - " self._do_task(fn, args)\n", - " File \"/Users/stevenkrawczyk/Development/prompttools/prompttools/requests/request_queue.py\", line 42, in _do_task\n", - " res = self._run(fn, args)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 289, in wrapped_f\n", - " return self(f, *args, **kw)\n", - " ^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 379, in __call__\n", - " do = self.iter(retry_state=retry_state)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 314, in iter\n", - " return fut.result()\n", - " ^^^^^^^^^^^^\n", - " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py\", line 449, in result\n", - " return self.__get_result()\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/concurrent/futures/_base.py\", line 401, in __get_result\n", - " raise self._exception\n", - " File \"/usr/local/lib/python3.11/site-packages/tenacity/__init__.py\", line 382, in 
__call__\n", - " result = fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/stevenkrawczyk/Development/prompttools/prompttools/requests/request_queue.py\", line 55, in _run\n", - " result = fn(**args)\n", - " ^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/openai/api_resources/chat_completion.py\", line 25, in create\n", - " return super().create(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/openai/api_resources/abstract/engine_api_resource.py\", line 153, in create\n", - " response, _, api_key = requestor.request(\n", - " ^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/openai/api_requestor.py\", line 230, in request\n", - " resp, got_stream = self._interpret_response(result, stream)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/site-packages/openai/api_requestor.py\", line 624, in _interpret_response\n", - " self._interpret_response_line(\n", - " File \"/usr/local/lib/python3.11/site-packages/openai/api_requestor.py\", line 687, in _interpret_response_line\n", - " raise self.handle_error_response(\n", - "openai.error.InvalidRequestError: We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. 
If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'model': 'gpt-3.5-turbo', 'messages': [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Who was the first president?'}], 'temperature': 0.0, 'top_p': 1.0, 'n': 1, 'stream': False, 'stop': None, 'max_token': inf, 'presence_penalty': 0, 'frequency_penalty': 0, 'logit_bias': None}\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mexperiment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Development/prompttools/prompttools/experiment/experiments/experiment.py:136\u001b[0m, in \u001b[0;36mExperiment.run\u001b[0;34m(self, runs)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(runs):\n\u001b[1;32m 132\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqueue\u001b[38;5;241m.\u001b[39menqueue(\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompletion_fn,\n\u001b[1;32m 134\u001b[0m combo,\n\u001b[1;32m 135\u001b[0m )\n\u001b[0;32m--> 136\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresults \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqueue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresults\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 137\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscores[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlatency\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqueue\u001b[38;5;241m.\u001b[39mlatencies()\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mresults) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", - "File \u001b[0;32m~/Development/prompttools/prompttools/requests/request_queue.py:81\u001b[0m, in \u001b[0;36mRequestQueue.results\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 77\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mresults\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mobject\u001b[39m]]:\n\u001b[1;32m 78\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;124;03m Joins the queue and gets results.\u001b[39;00m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest_results\n", - "File \u001b[0;32m/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/queue.py:90\u001b[0m, in \u001b[0;36mQueue.join\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mall_tasks_done:\n\u001b[1;32m 89\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munfinished_tasks:\n\u001b[0;32m---> 90\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_tasks_done\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/Cellar/python@3.11/3.11.4/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py:320\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 320\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 321\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "experiment.run()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "266c13eb", - "metadata": {}, - "source": [ - "## Evaluate the model response" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "bebb8023", - "metadata": {}, - "source": [ - "To evaluate the results, we'll define an eval function. We can use semantic distance to check if the model's response is similar to our expected output." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ddbb951", - "metadata": {}, - "outputs": [], - "source": [ - "from prompttools.utils import similarity\n", - "\n", - "\n", - "EXPECTED = {\"Who was the first president?\": \"George W\"}\n", - "\n", - "def extract_responses(output) -> str:\n", - " return [choice[\"message\"][\"content\"] for choice in output[\"choices\"]]\n", - "\n", - "\n", - "def measure_similarity(\n", - " messages: List[Dict[str, str]], results: Dict, metadata: Dict\n", - ") -> float:\n", - " \"\"\"\n", - " A simple test that checks semantic similarity between the user input\n", - " and the model's text responses.\n", - " \"\"\"\n", - " distances = [\n", - " similarity.compute(EXPECTED[messages[1][\"content\"]], response)\n", - " for response in extract_responses(results)\n", - " ]\n", - " return min(distances)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "974d6065", - "metadata": {}, - "source": [ - "Finally, we can evaluate and visualize the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e80dfeec", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "experiment.evaluate(\"similar_to_expected\", measure_similarity)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d09c18e", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "experiment.visualize()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 1d868bab670118882c538fe71c8ead06495552c6 Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Thu, 20 Jul 2023 20:30:31 -0700 Subject: [PATCH 04/10] Add back nb --- examples/notebooks/OpenAIChatExperiment.ipynb | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 examples/notebooks/OpenAIChatExperiment.ipynb diff --git a/examples/notebooks/OpenAIChatExperiment.ipynb b/examples/notebooks/OpenAIChatExperiment.ipynb new file mode 100644 index 00000000..ae0dd4f8 --- /dev/null +++ b/examples/notebooks/OpenAIChatExperiment.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0a13ddc8", + "metadata": {}, + "source": [ + "# OpenAI Chat Experiment Example" + ] + }, + { + "cell_type": "markdown", + "id": "623f0cfe", + "metadata": {}, + "source": [ + "## Installations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885dabeb", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --quiet --force-reinstall prompttools" + ] + }, + { + "cell_type": "markdown", + "id": "2eac35f8", + "metadata": {}, + "source": [ + "## Setup imports and API keys" + ] + }, + { + 
"cell_type": "markdown", + "id": "5edba05a", + "metadata": {}, + "source": [ + "First, we'll need to set our API keys. If we are in DEBUG mode, we don't need to use a real OpenAI key, so for now we'll set them to empty strings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed4e635e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['DEBUG']=\"1\"\n", + "os.environ['OPENAI_API_KEY'] = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "842f1e47", + "metadata": {}, + "source": [ + "Then we'll import the relevant `prompttools` modules to setup our experiment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "beaa70a1", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict, List, Tuple\n", + "from prompttools.experiment import OpenAIChatExperiment" + ] + }, + { + "cell_type": "markdown", + "id": "622dea9a", + "metadata": {}, + "source": [ + "## Run an experiment" + ] + }, + { + "cell_type": "markdown", + "id": "3babfe5a", + "metadata": {}, + "source": [ + "Next, we create our test inputs. We can iterate over models, inputs, and configurations like temperature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9114cfbf", + "metadata": {}, + "outputs": [], + "source": [ + "models = ['gpt-3.5-turbo', 'gpt-3.5-turbo-0613']\n", + "messages = [[\n", + " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Who was the first president?\"},\n", + "]]\n", + "temperatures = [0.0, 1.0]\n", + "\n", + "experiment = OpenAIChatExperiment(models, messages, temperature=temperatures)" + ] + }, + { + "cell_type": "markdown", + "id": "f3fa5450", + "metadata": {}, + "source": [ + "We can then run the experiment to get results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83b33130", + "metadata": {}, + "outputs": [], + "source": [ + "experiment.run()" + ] + }, + { + "cell_type": "markdown", + "id": "266c13eb", + "metadata": {}, + "source": [ + "## Evaluate the model response" + ] + }, + { + "cell_type": "markdown", + "id": "bebb8023", + "metadata": {}, + "source": [ + "To evaluate the results, we'll define an eval function. We can use semantic distance to check if the model's response is similar to our expected output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ddbb951", + "metadata": {}, + "outputs": [], + "source": [ + "from prompttools.utils import similarity\n", + "\n", + "\n", + "EXPECTED = {\"Who was the first president?\": \"George W\"}\n", + "\n", + "def extract_responses(output) -> str:\n", + " return [choice[\"message\"][\"content\"] for choice in output[\"choices\"]]\n", + "\n", + "\n", + "def measure_similarity(\n", + " messages: List[Dict[str, str]], results: Dict, metadata: Dict\n", + ") -> float:\n", + " \"\"\"\n", + " A simple test that checks semantic similarity between the user input\n", + " and the model's text responses.\n", + " \"\"\"\n", + " distances = [\n", + " similarity.compute(EXPECTED[messages[1][\"content\"]], response)\n", + " for response in extract_responses(results)\n", + " ]\n", + " return min(distances)" + ] + }, + { + "cell_type": "markdown", + "id": "974d6065", + "metadata": {}, + "source": [ + "Finally, we can evaluate and visualize the results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e80dfeec", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "experiment.evaluate(\"similar_to_expected\", measure_similarity)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d09c18e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "experiment.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0007a1f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b74eb878865f92d7f2ee4061f5df1a2731ad1f17 Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Thu, 20 Jul 2023 20:37:22 -0700 Subject: [PATCH 05/10] Formatting --- examples/prompttests/test_chromadb.py | 8 ++--- examples/prompttests/test_huggingface_hub.py | 13 ++------ examples/prompttests/test_openai_chat.py | 13 ++------ .../experiments/huggingface_hub_experiment.py | 4 +-- .../experiments/llama_cpp_experiment.py | 14 ++++----- .../experiments/openai_chat_experiment.py | 10 +++--- .../openai_completion_experiment.py | 2 +- prompttools/experiment/widgets/comparison.py | 30 ++++-------------- prompttools/experiment/widgets/feedback.py | 26 +++------------- prompttools/harness/harness.py | 8 ++--- .../harness/multi_experiment_harness.py | 27 ++++++++-------- prompttools/mock/mock.py | 10 +++--- .../runner/prompt_template_runner.py | 31 +++++-------------- prompttools/prompttest/runner/runner.py | 4 +-- .../prompttest/runner/system_prompt_runner.py | 31 +++++-------------- 
prompttools/requests/request_queue.py | 4 +-- prompttools/requests/retries.py | 4 +-- prompttools/selector/prompt_selector.py | 14 ++++----- prompttools/utils/autoeval.py | 4 +-- prompttools/utils/similarity.py | 4 +-- setup.py | 6 +--- 21 files changed, 81 insertions(+), 186 deletions(-) diff --git a/examples/prompttests/test_chromadb.py b/examples/prompttests/test_chromadb.py index 8ddbdab0..12aef7e9 100644 --- a/examples/prompttests/test_chromadb.py +++ b/examples/prompttests/test_chromadb.py @@ -12,12 +12,8 @@ EXPECTED = {"Who was the first president of the USA?": "George Washington"} -if not ( - ("CHROMADB_API_TOKEN" in os.environ) or ("DEBUG" in os.environ) -): # placeholder api naming - print( - "Error: This example requires you to set either your CHROMADB_API_TOKEN or DEBUG=1" - ) +if not (("CHROMADB_API_TOKEN" in os.environ) or ("DEBUG" in os.environ)): # placeholder api naming + print("Error: This example requires you to set either your CHROMADB_API_TOKEN or DEBUG=1") exit(1) diff --git a/examples/prompttests/test_huggingface_hub.py b/examples/prompttests/test_huggingface_hub.py index 17c511f5..8ef6e968 100644 --- a/examples/prompttests/test_huggingface_hub.py +++ b/examples/prompttests/test_huggingface_hub.py @@ -13,9 +13,7 @@ EXPECTED = {"Who was the first president of the USA?": "George Washington"} if not (("HUGGINGFACEHUB_API_TOKEN" in os.environ) or ("DEBUG" in os.environ)): - print( - "Error: This example requires you to set either your HUGGINGFACEHUB_API_TOKEN or DEBUG=1" - ) + print("Error: This example requires you to set either your HUGGINGFACEHUB_API_TOKEN or DEBUG=1") exit(1) @@ -33,18 +31,13 @@ def extract_responses(output) -> list[str]: prompt_template="Question: {{input}}", user_input=[{"input": "Who was the first president of the USA?"}], ) -def measure_similarity( - input_pair: Tuple[str, Dict[str, str]], results: Dict, metadata: Dict -) -> float: +def measure_similarity(input_pair: Tuple[str, Dict[str, str]], results: Dict, metadata: 
Dict) -> float: r""" A simple test that checks semantic similarity between the user input and the model's text responses. """ expected = EXPECTED[input_pair[1]["input"]] - scores = [ - similarity.compute(expected, response) - for response in extract_responses(results) - ] + scores = [similarity.compute(expected, response) for response in extract_responses(results)] return max(scores) diff --git a/examples/prompttests/test_openai_chat.py b/examples/prompttests/test_openai_chat.py index f1c6e92e..bf72fb9a 100644 --- a/examples/prompttests/test_openai_chat.py +++ b/examples/prompttests/test_openai_chat.py @@ -13,9 +13,7 @@ EXPECTED = {"Who was the first president of the USA?": "George Washington"} if not (("OPENAI_API_KEY" in os.environ) or ("DEBUG" in os.environ)): - print( - "Error: This example requires you to set either your OPENAI_API_KEY or DEBUG=1" - ) + print("Error: This example requires you to set either your OPENAI_API_KEY or DEBUG=1") exit(1) @@ -33,18 +31,13 @@ def extract_responses(output) -> list[str]: prompt_template="Answer the following question: {{input}}", user_input=[{"input": "Who was the first president of the USA?"}], ) -def measure_similarity( - input_pair: Tuple[str, Dict[str, str]], results: Dict, metadata: Dict -) -> float: +def measure_similarity(input_pair: Tuple[str, Dict[str, str]], results: Dict, metadata: Dict) -> float: r""" A simple test that checks semantic similarity between the user input and the model's text responses. 
""" expected = EXPECTED[input_pair[1]["input"]] - distances = [ - similarity.compute(expected, response) - for response in extract_responses(results) - ] + distances = [similarity.compute(expected, response) for response in extract_responses(results)] return min(distances) diff --git a/prompttools/experiment/experiments/huggingface_hub_experiment.py b/prompttools/experiment/experiments/huggingface_hub_experiment.py index 0c185cbf..78c60ab1 100644 --- a/prompttools/experiment/experiments/huggingface_hub_experiment.py +++ b/prompttools/experiment/experiments/huggingface_hub_experiment.py @@ -52,9 +52,7 @@ def hf_completion_fn( token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"), task=params["task"], ) - model_kwargs = { - k: params[k] for k in params if k not in ["repo_id", "prompt", "task"] - } + model_kwargs = {k: params[k] for k in params if k not in ["repo_id", "prompt", "task"]} response = client(inputs=params["prompt"], params=model_kwargs) return response diff --git a/prompttools/experiment/experiments/llama_cpp_experiment.py b/prompttools/experiment/experiments/llama_cpp_experiment.py index 9df70ae7..bdd091da 100644 --- a/prompttools/experiment/experiments/llama_cpp_experiment.py +++ b/prompttools/experiment/experiments/llama_cpp_experiment.py @@ -91,7 +91,7 @@ def __init__( self.call_params = call_params self.model_params["model_path"] = model_path - # If we are using a prompt selector, we need to + # If we are using a prompt selector, we need to # render the prompts from the selector if isinstance(prompt[0], PromptSelector): self.call_params["prompt"] = [selector.for_llama() for selector in prompt] @@ -113,12 +113,10 @@ def prepare(self) -> None: Creates argument combinations by taking the cartesian product of all inputs. 
""" self.model_argument_combos = [ - dict(zip(self.model_params, val)) - for val in itertools.product(*self.model_params.values()) + dict(zip(self.model_params, val)) for val in itertools.product(*self.model_params.values()) ] self.call_argument_combos = [ - dict(zip(self.call_params, val)) - for val in itertools.product(*self.call_params.values()) + dict(zip(self.call_params, val)) for val in itertools.product(*self.call_params.values()) ] def llama_completion_fn( @@ -165,7 +163,7 @@ def _extract_responses(output: Dict[str, object]) -> list[str]: return [choice["text"] for choice in output["choices"]] def _get_model_names(self): - return [os.path.basename(combo['model_path']) for combo in self.argument_combos] - + return [os.path.basename(combo["model_path"]) for combo in self.argument_combos] + def _get_prompts(self): - return [combo['prompt'] for combo in self.argument_combos] \ No newline at end of file + return [combo["prompt"] for combo in self.argument_combos] diff --git a/prompttools/experiment/experiments/openai_chat_experiment.py b/prompttools/experiment/experiments/openai_chat_experiment.py index 49d76bf1..68605251 100644 --- a/prompttools/experiment/experiments/openai_chat_experiment.py +++ b/prompttools/experiment/experiments/openai_chat_experiment.py @@ -42,7 +42,9 @@ def __init__( # messages, as well as create prompt_keys to map the messages # to corresponding prompts in other models. 
if isinstance(messages[0], PromptSelector): - self.prompt_keys = {str(selector.for_openai_chat()[-1]["content"]): selector.for_llama() for selector in messages} + self.prompt_keys = { + str(selector.for_openai_chat()[-1]["content"]): selector.for_llama() for selector in messages + } messages = [selector.for_openai_chat() for selector in messages] else: self.prompt_keys = messages @@ -70,7 +72,7 @@ def _is_chat(self): return True def _get_model_names(self): - return [combo['model'] for combo in self.argument_combos] - + return [combo["model"] for combo in self.argument_combos] + def _get_prompts(self): - return [self.prompt_keys[str(combo['messages'][-1]["content"])] for combo in self.argument_combos] \ No newline at end of file + return [self.prompt_keys[str(combo["messages"][-1]["content"])] for combo in self.argument_combos] diff --git a/prompttools/experiment/experiments/openai_completion_experiment.py b/prompttools/experiment/experiments/openai_completion_experiment.py index 8c5029c7..b24cbed9 100644 --- a/prompttools/experiment/experiments/openai_completion_experiment.py +++ b/prompttools/experiment/experiments/openai_completion_experiment.py @@ -64,4 +64,4 @@ def _extract_responses(output: Dict[str, object]) -> list[str]: return [choice["text"] for choice in output["choices"]] def _get_model_names(self): - return [combo['model'] for combo in self.argument_combos] \ No newline at end of file + return [combo["model"] for combo in self.argument_combos] diff --git a/prompttools/experiment/widgets/comparison.py b/prompttools/experiment/widgets/comparison.py index 51c6f636..8ed96adf 100644 --- a/prompttools/experiment/widgets/comparison.py +++ b/prompttools/experiment/widgets/comparison.py @@ -21,9 +21,7 @@ def __init__(self, completion_fn, agg_fn, eval_listener_fn): self.agg_fn = agg_fn self.eval_listener_fn = eval_listener_fn - def _get_comparison_submission_listener( - self, table: pd.DataFrame, models: List[str] - ) -> Callable: + def 
_get_comparison_submission_listener(self, table: pd.DataFrame, models: List[str]) -> Callable: def on_click(b): sorted_scores = self.agg_fn(table, 0) data = { @@ -40,24 +38,12 @@ def set_models(self, models: List[str]) -> None: self.row_len = 2 + len(self.models) def get_header_widgets(self) -> List[object]: - return ( - [widgets.Label("Input")] - + [widgets.Label(model) for model in self.models] - + [widgets.Label("Feedback")] - ) + return [widgets.Label("Input")] + [widgets.Label(model) for model in self.models] + [widgets.Label("Feedback")] def get_row_widgets(self, index, row): - items = [ - widgets.HTML( - value="

" + row.name + "

" - ) - ] + items = [widgets.HTML(value="

" + row.name + "

")] items += [ - widgets.HTML( - value="

" - + row[model][0] - + "

" - ) + widgets.HTML(value="

" + row[model][0] + "

") for model in self.models ] feedback_dropdown = widgets.Dropdown( @@ -76,17 +62,13 @@ def get_footer_widgets(self, table): button_style="success", tooltip="Submit", ) - submit_button.on_click( - self._get_comparison_submission_listener(table, self.models) - ) + submit_button.on_click(self._get_comparison_submission_listener(table, self.models)) return [widgets.Label("")] * (self.row_len - 1) + [submit_button] def display(self, items): row_len = 2 + len(self.models) grid = widgets.GridBox( items, - layout=widgets.Layout( - grid_template_columns="repeat(" + str(row_len) + ", 230px)" - ), + layout=widgets.Layout(grid_template_columns="repeat(" + str(row_len) + ", 230px)"), ) display.display(grid) diff --git a/prompttools/experiment/widgets/feedback.py b/prompttools/experiment/widgets/feedback.py index 22f70b75..750917d8 100644 --- a/prompttools/experiment/widgets/feedback.py +++ b/prompttools/experiment/widgets/feedback.py @@ -21,9 +21,7 @@ def __init__(self, completion_fn, agg_fn, eval_listener_fn): self.agg_fn = agg_fn self.eval_listener_fn = eval_listener_fn - def _get_feedback_submission_listener( - self, table: pd.DataFrame, pivot_columns: List[str] - ) -> Callable: + def _get_feedback_submission_listener(self, table: pd.DataFrame, pivot_columns: List[str]) -> Callable: def on_click(b): sorted_scores = self.agg_fn(table, "feedback", pivot_columns[0]) data = { @@ -48,21 +46,9 @@ def get_header_widgets(self) -> List[object]: def get_row_widgets(self, index, row): items = [ - widgets.HTML( - value="

" - + row[self.pivot_columns[0]] - + "

" - ), - widgets.HTML( - value="

" - + row[self.pivot_columns[1]] - + "

" - ), - widgets.HTML( - value="

" - + ", ".join(row["response(s)"]) - + "

" - ), + widgets.HTML(value="

" + row[self.pivot_columns[0]] + "

"), + widgets.HTML(value="

" + row[self.pivot_columns[1]] + "

"), + widgets.HTML(value="

" + ", ".join(row["response(s)"]) + "

"), ] feedback_dropdown = widgets.Dropdown( options=[("\U0001F44D", 1), ("\U0001F44E", 0)], @@ -80,9 +66,7 @@ def get_footer_widgets(self, table): button_style="success", tooltip="Submit", ) - submit_button.on_click( - self._get_feedback_submission_listener(table, self.pivot_columns) - ) + submit_button.on_click(self._get_feedback_submission_listener(table, self.pivot_columns)) return [ widgets.Label(""), widgets.Label(""), diff --git a/prompttools/harness/harness.py b/prompttools/harness/harness.py index bbb30278..ef824574 100644 --- a/prompttools/harness/harness.py +++ b/prompttools/harness/harness.py @@ -37,9 +37,7 @@ def run(self) -> None: """ self.experiment.run(runs=self.runs) - def evaluate( - self, metric_name: str, eval_fn: Callable, use_input_pairs: bool = True - ) -> None: + def evaluate(self, metric_name: str, eval_fn: Callable, use_input_pairs: bool = True) -> None: r""" Uses the given eval_fn to evaluate the results of the underlying experiment. """ @@ -65,6 +63,4 @@ def rank(self, metric_name: str, is_average: bool = False) -> Dict[str, float]: Scores and ranks the experiment inputs using the pivot columns, e.g. prompt templates or system prompts. 
""" - return self.experiment.rank( - self.input_pairs_dict, self.PIVOT_COLUMNS, metric_name, is_average - ) + return self.experiment.rank(self.input_pairs_dict, self.PIVOT_COLUMNS, metric_name, is_average) diff --git a/prompttools/harness/multi_experiment_harness.py b/prompttools/harness/multi_experiment_harness.py index 6fa33117..80aa0ce9 100644 --- a/prompttools/harness/multi_experiment_harness.py +++ b/prompttools/harness/multi_experiment_harness.py @@ -3,7 +3,8 @@ from prompttools.experiment import Experiment import pandas as pd -class MultiExperimentHarness(): + +class MultiExperimentHarness: def __init__(self, experiments: List[Experiment], prompts: List[str] = []): self.experiments = experiments self.prompts = prompts @@ -16,9 +17,7 @@ def run(self): for experiment in self.experiments: experiment.run() - def evaluate( - self, metric_name: str, eval_fn: Callable - ) -> None: + def evaluate(self, metric_name: str, eval_fn: Callable) -> None: for experiment in self.experiments: experiment.evaluate(metric_name, eval_fn) @@ -28,13 +27,15 @@ def gather_feedback(self) -> None: def _get_argument_combos(self): tmp = [combo for experiment in self.experiments for combo in experiment.argument_combos] return tmp - + def _get_prompts(self): tmp = [combo for experiment in self.experiments for combo in experiment._get_prompts()] return tmp def _get_results(self): - tmp = [experiment._extract_responses(result) for experiment in self.experiments for result in experiment.results] + tmp = [ + experiment._extract_responses(result) for experiment in self.experiments for result in experiment.results + ] return tmp def _get_scores(self): @@ -43,19 +44,19 @@ def _get_scores(self): for name, score in experiment.scores.items(): scores[name].extend(score) return scores - + def _get_experiment_names(self): tmp = [name for experiment in self.experiments for name in experiment._get_model_names()] - return tmp + return tmp def visualize(self, colname: str = None) -> None: argument_combos = 
self._get_argument_combos() scores = self._get_scores() data = { - 'prompt': self._get_prompts(), + "prompt": self._get_prompts(), "response(s)": self._get_results(), "latency": scores["latency"], - "model": self._get_experiment_names() + "model": self._get_experiment_names(), } # Add scores for each eval fn, including feedback for metric_name, evals in scores.items(): @@ -66,11 +67,11 @@ def visualize(self, colname: str = None) -> None: df = pd.pivot_table( df, values=colname, - index=['prompt'], - columns=['model'], + index=["prompt"], + columns=["model"], aggfunc=lambda x: x.iloc[0], ) return df def rank(self, metric_name: str, is_average: bool = False) -> Dict[str, float]: - pass \ No newline at end of file + pass diff --git a/prompttools/mock/mock.py b/prompttools/mock/mock.py index 79056243..76d98a22 100644 --- a/prompttools/mock/mock.py +++ b/prompttools/mock/mock.py @@ -53,9 +53,9 @@ def mock_hf_completion_fn(**kwargs): def mock_chromadb_fn(**kwargs): return { - 'ids': [['id1']], - 'embeddings': None, - 'documents': [['George Washington lived in modern day Philadelphia']], - 'metadatas': [[{'source': 'my_source'}]], - 'distances': [[0.5932742953300476]] + "ids": [["id1"]], + "embeddings": None, + "documents": [["George Washington lived in modern day Philadelphia"]], + "metadatas": [[{"source": "my_source"}]], + "distances": [[0.5932742953300476]], } diff --git a/prompttools/prompttest/runner/prompt_template_runner.py b/prompttools/prompttest/runner/prompt_template_runner.py index ce043d14..6dc3b757 100644 --- a/prompttools/prompttest/runner/prompt_template_runner.py +++ b/prompttools/prompttest/runner/prompt_template_runner.py @@ -26,16 +26,11 @@ def __init__(self): self.user_inputs = {} super().__init__() - def read( - self, prompt_template_file: str, user_input_file: str - ) -> Tuple[str, List[Dict[str, str]]]: + def read(self, prompt_template_file: str, user_input_file: str) -> Tuple[str, List[Dict[str, str]]]: """ Reads data from files and parses it 
into a prompt template and user input. """ - if ( - prompt_template_file in self.prompt_templates - and user_input_file in self.user_inputs - ): + if prompt_template_file in self.prompt_templates and user_input_file in self.user_inputs: return ( self.prompt_templates[prompt_template_file], self.user_inputs[user_input_file], @@ -86,20 +81,13 @@ def run_prompt_template_test( """ Runs the prompt test. """ - key = prompt_template_test_runner.run( - experiment, model_name, prompt_template, user_inputs, model_args - ) + key = prompt_template_test_runner.run(experiment, model_name, prompt_template, user_inputs, model_args) prompt_template_test_runner.evaluate(key, metric_name, eval_fn, use_input_pairs) scored_template = prompt_template_test_runner.rank(key, metric_name, is_average) if not scored_template: - logging.error( - "Something went wrong during testing. Make sure your API keys are set correctly." - ) + logging.error("Something went wrong during testing. Make sure your API keys are set correctly.") raise PromptTestSetupException - if ( - scored_template[prompt_template] < threshold - and threshold_type is ThresholdType.MINIMUM - ): + if scored_template[prompt_template] < threshold and threshold_type is ThresholdType.MINIMUM: log_failure( metric_name, threshold, @@ -107,10 +95,7 @@ def run_prompt_template_test( threshold_type=threshold_type, ) return 1 - if ( - scored_template[prompt_template] > threshold - and threshold_type is ThresholdType.MAXIMUM - ): + if scored_template[prompt_template] > threshold and threshold_type is ThresholdType.MAXIMUM: log_failure( metric_name, threshold, @@ -137,9 +122,7 @@ def run_prompt_template_test_from_files( """ Reads data in from files and runs the prompt test. 
""" - prompt_template, user_inputs = prompt_template_test_runner.read( - prompt_template_file, user_input_file - ) + prompt_template, user_inputs = prompt_template_test_runner.read(prompt_template_file, user_input_file) return run_prompt_template_test( experiment, model_name, diff --git a/prompttools/prompttest/runner/runner.py b/prompttools/prompttest/runner/runner.py index c7c5b91b..ef8f4a5c 100644 --- a/prompttools/prompttest/runner/runner.py +++ b/prompttools/prompttest/runner/runner.py @@ -57,6 +57,4 @@ def _get_harness( user_inputs: List[Dict[str, str]], model_args: Dict[str, object], ): - raise NotImplementedError( - "This should be implemented by a subclass of `PromptTestRunner`." - ) + raise NotImplementedError("This should be implemented by a subclass of `PromptTestRunner`.") diff --git a/prompttools/prompttest/runner/system_prompt_runner.py b/prompttools/prompttest/runner/system_prompt_runner.py index 44af064b..33ecc8f2 100644 --- a/prompttools/prompttest/runner/system_prompt_runner.py +++ b/prompttools/prompttest/runner/system_prompt_runner.py @@ -26,16 +26,11 @@ def __init__(self): self.human_messages = {} super().__init__() - def read( - self, system_prompt_file: str, human_messages_file: str - ) -> Tuple[str, List[List[str]]]: + def read(self, system_prompt_file: str, human_messages_file: str) -> Tuple[str, List[List[str]]]: r""" Reads data from files and parses it into a system prompt and human messages. """ - if ( - system_prompt_file in self.system_prompts - and human_messages_file in self.human_messages - ): + if system_prompt_file in self.system_prompts and human_messages_file in self.human_messages: return ( self.system_prompts[system_prompt_file], self.human_messages[human_messages_file], @@ -84,20 +79,13 @@ def run_system_prompt_test( r""" Runs the prompt test. 
""" - key = system_prompt_test_runner.run( - experiment, model_name, system_prompt, human_messages, model_args - ) + key = system_prompt_test_runner.run(experiment, model_name, system_prompt, human_messages, model_args) system_prompt_test_runner.evaluate(key, metric_name, eval_fn, use_input_pairs) scored_template = system_prompt_test_runner.rank(key, metric_name, is_average) if not scored_template: - logging.error( - "Something went wrong during testing. Make sure your API keys are set correctly." - ) + logging.error("Something went wrong during testing. Make sure your API keys are set correctly.") raise PromptTestSetupException - if ( - scored_template[system_prompt] < threshold - and threshold_type is ThresholdType.MINIMUM - ): + if scored_template[system_prompt] < threshold and threshold_type is ThresholdType.MINIMUM: log_failure( metric_name, threshold, @@ -105,10 +93,7 @@ def run_system_prompt_test( threshold_type=threshold_type, ) return 1 - if ( - scored_template[system_prompt] > threshold - and threshold_type is ThresholdType.MAXIMUM - ): + if scored_template[system_prompt] > threshold and threshold_type is ThresholdType.MAXIMUM: log_failure( metric_name, threshold, @@ -135,9 +120,7 @@ def run_system_prompt_test_from_files( r""" Reads data in from files and runs the prompt test. """ - system_prompt, human_messages = system_prompt_test_runner.read( - system_prompt_file, human_messages_file - ) + system_prompt, human_messages = system_prompt_test_runner.read(system_prompt_file, human_messages_file) return run_system_prompt_test( experiment, model_name, diff --git a/prompttools/requests/request_queue.py b/prompttools/requests/request_queue.py index 338600ce..188165e6 100644 --- a/prompttools/requests/request_queue.py +++ b/prompttools/requests/request_queue.py @@ -47,9 +47,7 @@ def _do_task(self, fn: Callable, args: Dict[str, object]) -> None: logging.error("Authentication error. 
Skipping request.") @retry_decorator - def _run( - self, fn: Callable, args: Dict[str, object] - ) -> Tuple[Dict[str, object], float]: + def _run(self, fn: Callable, args: Dict[str, object]) -> Tuple[Dict[str, object], float]: start = perf_counter() result = fn(**args) return result, perf_counter() - start diff --git a/prompttools/requests/retries.py b/prompttools/requests/retries.py index 47e18643..27bf52a3 100644 --- a/prompttools/requests/retries.py +++ b/prompttools/requests/retries.py @@ -15,9 +15,7 @@ import logging -def generate_retry_decorator( - wait_lower_bound: int = 3, wait_upper_bound: int = 12, max_retry_attempts: int = 5 -): +def generate_retry_decorator(wait_lower_bound: int = 3, wait_upper_bound: int = 12, max_retry_attempts: int = 5): r""" Creates a retry decorator that can be used for requests. It looks for specific exceptions and waits for certain about of time before retrying. This improves the reliability of the request queue. diff --git a/prompttools/selector/prompt_selector.py b/prompttools/selector/prompt_selector.py index 995c154d..92e11a1a 100644 --- a/prompttools/selector/prompt_selector.py +++ b/prompttools/selector/prompt_selector.py @@ -14,23 +14,23 @@ ### OUTPUT """ + class PromptSelector: r""" - An abstraction for rendering the same prompt - for different models, e.g. OpenAI Chat models + An abstraction for rendering the same prompt + for different models, e.g. 
OpenAI Chat models and Llama models """ + def __init__(self, instruction: str, user_input: str): self.instruction = instruction self.user_input = user_input - + def for_openai_chat(self): return [ {"role": "system", "content": self.instruction}, {"role": "user", "content": self.user_input}, ] - + def for_llama(self): - return TEMPLATE.format(instruction=self.instruction, - user_input=self.user_input) - \ No newline at end of file + return TEMPLATE.format(instruction=self.instruction, user_input=self.user_input) diff --git a/prompttools/utils/autoeval.py b/prompttools/utils/autoeval.py index 14ea2e7e..58f4aba7 100644 --- a/prompttools/utils/autoeval.py +++ b/prompttools/utils/autoeval.py @@ -46,7 +46,5 @@ def compute(prompt: str, response: str, model: str = "gpt-4") -> float: """ if not os.environ["OPENAI_API_KEY"]: raise PromptToolsUtilityError - evaluation = openai.ChatCompletion.create( - model=model, messages=_get_messages(prompt, response) - ) + evaluation = openai.ChatCompletion.create(model=model, messages=_get_messages(prompt, response)) return 1.0 if "RIGHT" in evaluation["choices"][0]["message"]["content"] else 0.0 diff --git a/prompttools/utils/similarity.py b/prompttools/utils/similarity.py index 9ce3fac4..e54b15b1 100644 --- a/prompttools/utils/similarity.py +++ b/prompttools/utils/similarity.py @@ -16,9 +16,7 @@ def _get_embedding_model(): if len(EMBEDDING_MODEL) == 0: from sentence_transformers import SentenceTransformer - EMBEDDING_MODEL.append( - SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") - ) + EMBEDDING_MODEL.append(SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")) return EMBEDDING_MODEL[0] diff --git a/setup.py b/setup.py index 51abf432..a6503dfb 100644 --- a/setup.py +++ b/setup.py @@ -31,11 +31,7 @@ def _get_version(): sha = "Unknown" try: - sha = ( - subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=str(ROOT_DIR)) - .decode("ascii") - .strip() - ) + sha = subprocess.check_output(["git", "rev-parse", 
"HEAD"], cwd=str(ROOT_DIR)).decode("ascii").strip() except Exception: pass From 02fd48e3a9c2232c7edadfb91542ca6f71a794ce Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Thu, 20 Jul 2023 20:43:46 -0700 Subject: [PATCH 06/10] init file --- prompttools/selector/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 prompttools/selector/__init__.py diff --git a/prompttools/selector/__init__.py b/prompttools/selector/__init__.py new file mode 100644 index 00000000..4d3d5903 --- /dev/null +++ b/prompttools/selector/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Hegel AI, Inc. +# All rights reserved. +# +# This source code's license can be found in the +# LICENSE file in the root directory of this source tree. From 339ba1c2b05e24b516ec7d6c21700cae70fd59e2 Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Thu, 20 Jul 2023 20:45:37 -0700 Subject: [PATCH 07/10] Pathname --- examples/notebooks/GPT4vsLlama2.ipynb | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/notebooks/GPT4vsLlama2.ipynb b/examples/notebooks/GPT4vsLlama2.ipynb index 471e97e3..f99fb506 100644 --- a/examples/notebooks/GPT4vsLlama2.ipynb +++ b/examples/notebooks/GPT4vsLlama2.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "0a13ddc8", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "780dc3bf", "metadata": {}, @@ -19,6 +21,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "623f0cfe", "metadata": {}, @@ -27,6 +30,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "52881369", "metadata": {}, @@ -45,6 +49,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2eac35f8", "metadata": {}, @@ -53,6 +58,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5edba05a", "metadata": {}, @@ -73,6 +79,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "842f1e47", "metadata": {}, @@ -95,6 +102,7 @@ ] }, { + 
"attachments": {}, "cell_type": "markdown", "id": "622dea9a", "metadata": {}, @@ -103,6 +111,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0cd0bae8", "metadata": {}, @@ -143,6 +152,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3babfe5a", "metadata": {}, @@ -157,7 +167,7 @@ "metadata": {}, "outputs": [], "source": [ - "model_paths = ['/Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", + "model_paths = ['/your/path/to/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", "temperatures = [1.0]\n", "call_params = dict(temperature=temperatures)\n", "llama_experiment = LlamaCppExperiment(model_paths, selectors, call_params=call_params)" @@ -176,6 +186,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6c3162e6", "metadata": {}, @@ -207,6 +218,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2ceb662a", "metadata": {}, @@ -246,6 +258,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "974d6065", "metadata": {}, From 8be724e8d5ac57da78e83a0c223616e518716be4 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Fri, 21 Jul 2023 08:45:02 -0700 Subject: [PATCH 08/10] Update examples/notebooks/GPT4vsLlama2.ipynb --- examples/notebooks/GPT4vsLlama2.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks/GPT4vsLlama2.ipynb b/examples/notebooks/GPT4vsLlama2.ipynb index f99fb506..d68c5d93 100644 --- a/examples/notebooks/GPT4vsLlama2.ipynb +++ b/examples/notebooks/GPT4vsLlama2.ipynb @@ -148,7 +148,7 @@ "\"\"\"\n", "Is 7 a prime number?\n", "\"\"\"]\n", - "selectors = [PromptSelector(instructions[i], inputs[i]) for i in range(3)]" + "selectors = [PromptSelector(instructions[i], inputs[i]) for i in range(len(inputs))]" ] }, { From 00556875f7a6d81e237f419424cd0293e218fde4 Mon Sep 17 00:00:00 2001 From: steven krawczyk 
Date: Fri, 21 Jul 2023 10:08:45 -0700 Subject: [PATCH 09/10] Address comments --- examples/notebooks/GPT4vsLlama2.ipynb | 438 ++++++++++++++++-- .../experiment/experiments/experiment.py | 2 +- 2 files changed, 408 insertions(+), 32 deletions(-) diff --git a/examples/notebooks/GPT4vsLlama2.ipynb b/examples/notebooks/GPT4vsLlama2.ipynb index d68c5d93..e6ac50cc 100644 --- a/examples/notebooks/GPT4vsLlama2.ipynb +++ b/examples/notebooks/GPT4vsLlama2.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "0a13ddc8", "metadata": {}, @@ -10,7 +9,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "780dc3bf", "metadata": {}, @@ -21,7 +19,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "623f0cfe", "metadata": {}, @@ -30,7 +27,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "52881369", "metadata": {}, @@ -40,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "885dabeb", "metadata": {}, "outputs": [], @@ -49,7 +45,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2eac35f8", "metadata": {}, @@ -58,7 +53,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5edba05a", "metadata": {}, @@ -68,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "id": "ed4e635e", "metadata": {}, "outputs": [], @@ -79,7 +73,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "842f1e47", "metadata": {}, @@ -89,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 3, "id": "beaa70a1", "metadata": {}, "outputs": [], @@ -102,7 +95,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "622dea9a", "metadata": {}, @@ -111,7 +103,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "0cd0bae8", "metadata": {}, @@ -121,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "2788d49f", "metadata": {}, "outputs": [], @@ -132,11 
+123,15 @@ "\"\"\",\n", "\"\"\"\n", "You are a customer support representative for a startup called Hegel AI.\n", + "For context, Hegel AI does not offer refunds.\n", "Answer the following customer question:\n", "\"\"\", \n", "\"\"\"\n", "You are a helpful math tutor.\n", "Answer the following math problem:\n", + "\"\"\", \n", + "\"\"\"\n", + "Based on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\n", "\"\"\"]\n", "inputs = [\"\"\"\n", "Draft a short sales email, 50 words or less, asking a prospect for 15 minutes\n", @@ -146,13 +141,15 @@ "Do you offer refunds?\n", "\"\"\",\n", "\"\"\"\n", - "Is 7 a prime number?\n", + "Is 147 a prime number?\n", + "\"\"\",\n", + "\"\"\"\n", + "The speed of the car was 45mph\n", "\"\"\"]\n", "selectors = [PromptSelector(instructions[i], inputs[i]) for i in range(len(inputs))]" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "3babfe5a", "metadata": {}, @@ -162,12 +159,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9114cfbf", "metadata": {}, "outputs": [], "source": [ - "model_paths = ['/your/path/to/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", + "model_paths = ['/Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin'] # Download from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main\n", "temperatures = [1.0]\n", "call_params = dict(temperature=temperatures)\n", "llama_experiment = LlamaCppExperiment(model_paths, selectors, call_params=call_params)" @@ -175,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "8fe83830", "metadata": {}, "outputs": [], @@ -186,17 +183,16 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "6c3162e6", "metadata": {}, "source": [ - "After that - we define our harness to run experiments" + "After that - we define our harness to run experiments. 
The `MultiExperimentHarness` is designed to run experiments across multiple model providers. Because the underlying APIs for LlamaCpp and OpenAI are different, we need a way to manage that complexity. So, we can use this harness to run experiments for different providers, and combine the results into a single table." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "9147649a", "metadata": {}, "outputs": [], @@ -206,19 +202,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "0f22ebd7", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama.cpp: loading model from /Users/stevenkrawczyk/Downloads/llama-2-7b-chat.ggmlv3.q2_K.bin\n", + "llama_model_load_internal: format = ggjt v3 (latest)\n", + "llama_model_load_internal: n_vocab = 32000\n", + "llama_model_load_internal: n_ctx = 512\n", + "llama_model_load_internal: n_embd = 4096\n", + "llama_model_load_internal: n_mult = 256\n", + "llama_model_load_internal: n_head = 32\n", + "llama_model_load_internal: n_layer = 32\n", + "llama_model_load_internal: n_rot = 128\n", + "llama_model_load_internal: ftype = 10 (mostly Q2_K)\n", + "llama_model_load_internal: n_ff = 11008\n", + "llama_model_load_internal: model size = 7B\n", + "llama_model_load_internal: ggml ctx size = 0.08 MB\n", + "llama_model_load_internal: mem required = 4525.65 MB (+ 1026.00 MB per state)\n", + "llama_new_context_with_model: kv self size = 256.00 MB\n", + "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | \n", + "\n", + "llama_print_timings: load time = 12348.40 ms\n", + "llama_print_timings: sample time = 73.65 ms / 88 runs ( 0.84 ms per token, 1194.79 tokens per second)\n", + "llama_print_timings: prompt eval time = 12348.33 ms / 93 tokens ( 132.78 ms per token, 7.53 
tokens per second)\n", + "llama_print_timings: eval time = 13102.41 ms / 87 runs ( 150.60 ms per token, 6.64 tokens per second)\n", + "llama_print_timings: total time = 25769.96 ms\n", + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 12348.40 ms\n", + "llama_print_timings: sample time = 87.64 ms / 102 runs ( 0.86 ms per token, 1163.81 tokens per second)\n", + "llama_print_timings: prompt eval time = 11123.06 ms / 60 tokens ( 185.38 ms per token, 5.39 tokens per second)\n", + "llama_print_timings: eval time = 14914.47 ms / 101 runs ( 147.67 ms per token, 6.77 tokens per second)\n", + "llama_print_timings: total time = 26406.45 ms\n", + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 12348.40 ms\n", + "llama_print_timings: sample time = 51.35 ms / 62 runs ( 0.83 ms per token, 1207.33 tokens per second)\n", + "llama_print_timings: prompt eval time = 9665.15 ms / 38 tokens ( 254.35 ms per token, 3.93 tokens per second)\n", + "llama_print_timings: eval time = 7996.12 ms / 61 runs ( 131.08 ms per token, 7.63 tokens per second)\n", + "llama_print_timings: total time = 17877.24 ms\n", + "Llama.generate: prefix-match hit\n", + "\n", + "llama_print_timings: load time = 12348.40 ms\n", + "llama_print_timings: sample time = 106.40 ms / 128 runs ( 0.83 ms per token, 1202.96 tokens per second)\n", + "llama_print_timings: prompt eval time = 9926.22 ms / 53 tokens ( 187.29 ms per token, 5.34 tokens per second)\n", + "llama_print_timings: eval time = 17259.87 ms / 127 runs ( 135.90 ms per token, 7.36 tokens per second)\n", + "llama_print_timings: total time = 27638.57 ms\n" + ] + } + ], "source": [ "harness.prepare()\n", "harness.run()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2ceb662a", "metadata": {}, @@ -228,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "8ddbb951", "metadata": {}, "outputs": [], @@ -258,7 +303,6 @@ ] }, { - "attachments": {}, 
"cell_type": "markdown", "id": "974d6065", "metadata": {}, @@ -268,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "e80dfeec", "metadata": { "scrolled": true @@ -280,22 +324,354 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "4d09c18e", "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
promptresponse(s)latencymodelauto-evaluation
0\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n[Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. \\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]]3.025437gpt-4-03141.0
1\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n[Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!]4.394606gpt-4-03141.0
2\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n[No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.]1.701209gpt-4-03141.0
3\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n[{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}]1.380243gpt-4-03141.0
4\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n[Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]]6.235348gpt-4-06131.0
5\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n[No, I'm sorry but Hegel AI does not offer refunds.]2.049971gpt-4-06131.0
6\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n[No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.]4.651636gpt-4-06131.0
7\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n[\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}]1.865187gpt-4-06131.0
8\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n[Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]]5.846508gpt-41.0
9\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n[No, I'm sorry, but Hegel AI does not offer refunds.]2.072807gpt-41.0
10\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n[No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.]4.129878gpt-41.0
11\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n[\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}]1.746500gpt-41.0
12\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n[\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]]25.770857llama-2-7b-chat.ggmlv3.q2_K.bin1.0
13\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n[\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!]26.407377llama-2-7b-chat.ggmlv3.q2_K.bin1.0
14\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n[\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.]17.878741llama-2-7b-chat.ggmlv3.q2_K.bin0.0
15\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n[\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n]27.639712llama-2-7b-chat.ggmlv3.q2_K.bin0.0
\n", + "
" + ], + "text/plain": [ + " prompt \n", + "0 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \\\n", + "1 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", + "2 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", + "3 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", + "4 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \n", + "5 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", + "6 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", + "7 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and 
\"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", + "8 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \n", + "9 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", + "10 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", + "11 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", + "12 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \n", + "13 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", + "14 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", + "15 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON 
using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", + "\n", + " response(s) \n", + "0 [Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. \\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]] \\\n", + "1 [Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!] \n", + "2 [No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.] \n", + "3 [{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}] \n", + "4 [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]] \n", + "5 [No, I'm sorry but Hegel AI does not offer refunds.] \n", + "6 [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", + "7 [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \n", + "8 [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. 
Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]] \n", + "9 [No, I'm sorry, but Hegel AI does not offer refunds.] \n", + "10 [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", + "11 [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \n", + "12 [\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]] \n", + "13 [\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!] \n", + "14 [\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.] 
\n", + "15 [\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n] \n", + "\n", + " latency model auto-evaluation \n", + "0 3.025437 gpt-4-0314 1.0 \n", + "1 4.394606 gpt-4-0314 1.0 \n", + "2 1.701209 gpt-4-0314 1.0 \n", + "3 1.380243 gpt-4-0314 1.0 \n", + "4 6.235348 gpt-4-0613 1.0 \n", + "5 2.049971 gpt-4-0613 1.0 \n", + "6 4.651636 gpt-4-0613 1.0 \n", + "7 1.865187 gpt-4-0613 1.0 \n", + "8 5.846508 gpt-4 1.0 \n", + "9 2.072807 gpt-4 1.0 \n", + "10 4.129878 gpt-4 1.0 \n", + "11 1.746500 gpt-4 1.0 \n", + "12 25.770857 llama-2-7b-chat.ggmlv3.q2_K.bin 1.0 \n", + "13 26.407377 llama-2-7b-chat.ggmlv3.q2_K.bin 1.0 \n", + "14 17.878741 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 \n", + "15 27.639712 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "harness.visualize()" ] }, + { + "cell_type": "markdown", + "id": "36fa7a07", + "metadata": {}, + "source": [ + "If we want to pivot on a column like `\"response(s)\"`, we simply pass that column name into the visualize function." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "9ee8dc43", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
modelgpt-4gpt-4-0314gpt-4-0613llama-2-7b-chat.ggmlv3.q2_K.bin
prompt
\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n[\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}][{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}][\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}][\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n]
\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n[No, I'm sorry, but Hegel AI does not offer refunds.][Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!][No, I'm sorry but Hegel AI does not offer refunds.][\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!]
\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n[No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.][No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.][No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.][\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.]
\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n[Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]][Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. \\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]][Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]][\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]]
\n", + "
" + ], + "text/plain": [ + "model gpt-4 \n", + "prompt \n", + "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \\\n", + "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [No, I'm sorry, but Hegel AI does not offer refunds.] \n", + "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", + "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. 
Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]] \n", + "\n", + "model gpt-4-0314 \n", + "prompt \n", + "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}] \\\n", + "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!] \n", + "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.] \n", + "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. 
\\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]] \n", + "\n", + "model gpt-4-0613 \n", + "prompt \n", + "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \\\n", + "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [No, I'm sorry but Hegel AI does not offer refunds.] \n", + "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", + "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. 
Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]] \n", + "\n", + "model llama-2-7b-chat.ggmlv3.q2_K.bin \n", + "prompt \n", + "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n] \n", + "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!] \n", + "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.] 
\n", + "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]] " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "harness.visualize(\"response(s)\")" ] diff --git a/prompttools/experiment/experiments/experiment.py b/prompttools/experiment/experiments/experiment.py index a68c5682..8ec536f0 100644 --- a/prompttools/experiment/experiments/experiment.py +++ b/prompttools/experiment/experiments/experiment.py @@ -125,7 +125,7 @@ def run( self.queue.enqueue( self.completion_fn, # We need to filter out defaults that are invalid JSON from the request - {k: v for k, v in combo.items() if v and v != float("inf")}, + {k: v for k, v in combo.items() if (v != None) and (v != float("inf"))}, ) self.results = self.queue.results() self.scores["latency"] = self.queue.latencies() From 4ef5653084f6dfee25562610add91eba50d0de7f Mon Sep 17 00:00:00 2001 From: steven krawczyk Date: Fri, 21 Jul 2023 13:07:58 -0700 Subject: [PATCH 10/10] Update promptselector --- examples/notebooks/GPT4vsLlama2.ipynb | 367 ++++++++++++------------ prompttools/selector/prompt_selector.py | 9 +- 2 files changed, 187 insertions(+), 189 deletions(-) diff --git a/examples/notebooks/GPT4vsLlama2.ipynb b/examples/notebooks/GPT4vsLlama2.ipynb index e6ac50cc..b8c53ba4 100644 --- a/examples/notebooks/GPT4vsLlama2.ipynb +++ 
b/examples/notebooks/GPT4vsLlama2.ipynb @@ -62,13 +62,13 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "id": "ed4e635e", "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ['DEBUG'] = \"\"\n", + "os.environ['DEBUG'] = \"1\"\n", "os.environ['OPENAI_API_KEY'] = \"\"" ] }, @@ -229,32 +229,32 @@ "llama_new_context_with_model: kv self size = 256.00 MB\n", "AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | \n", "\n", - "llama_print_timings: load time = 12348.40 ms\n", - "llama_print_timings: sample time = 73.65 ms / 88 runs ( 0.84 ms per token, 1194.79 tokens per second)\n", - "llama_print_timings: prompt eval time = 12348.33 ms / 93 tokens ( 132.78 ms per token, 7.53 tokens per second)\n", - "llama_print_timings: eval time = 13102.41 ms / 87 runs ( 150.60 ms per token, 6.64 tokens per second)\n", - "llama_print_timings: total time = 25769.96 ms\n", + "llama_print_timings: load time = 11862.64 ms\n", + "llama_print_timings: sample time = 81.80 ms / 98 runs ( 0.83 ms per token, 1198.04 tokens per second)\n", + "llama_print_timings: prompt eval time = 11862.57 ms / 94 tokens ( 126.20 ms per token, 7.92 tokens per second)\n", + "llama_print_timings: eval time = 13876.46 ms / 97 runs ( 143.06 ms per token, 6.99 tokens per second)\n", + "llama_print_timings: total time = 26087.71 ms\n", "Llama.generate: prefix-match hit\n", "\n", - "llama_print_timings: load time = 12348.40 ms\n", - "llama_print_timings: sample time = 87.64 ms / 102 runs ( 0.86 ms per token, 1163.81 tokens per second)\n", - "llama_print_timings: prompt eval time = 11123.06 ms / 60 tokens ( 185.38 ms per token, 5.39 tokens per second)\n", - "llama_print_timings: eval time = 14914.47 ms / 101 runs ( 147.67 ms per token, 6.77 tokens per second)\n", - "llama_print_timings: total time = 26406.45 ms\n", + "llama_print_timings: load time = 
11862.64 ms\n", + "llama_print_timings: sample time = 101.89 ms / 117 runs ( 0.87 ms per token, 1148.25 tokens per second)\n", + "llama_print_timings: prompt eval time = 10485.56 ms / 58 tokens ( 180.79 ms per token, 5.53 tokens per second)\n", + "llama_print_timings: eval time = 19148.67 ms / 116 runs ( 165.07 ms per token, 6.06 tokens per second)\n", + "llama_print_timings: total time = 30074.36 ms\n", "Llama.generate: prefix-match hit\n", "\n", - "llama_print_timings: load time = 12348.40 ms\n", - "llama_print_timings: sample time = 51.35 ms / 62 runs ( 0.83 ms per token, 1207.33 tokens per second)\n", - "llama_print_timings: prompt eval time = 9665.15 ms / 38 tokens ( 254.35 ms per token, 3.93 tokens per second)\n", - "llama_print_timings: eval time = 7996.12 ms / 61 runs ( 131.08 ms per token, 7.63 tokens per second)\n", - "llama_print_timings: total time = 17877.24 ms\n", + "llama_print_timings: load time = 11862.64 ms\n", + "llama_print_timings: sample time = 64.22 ms / 76 runs ( 0.85 ms per token, 1183.38 tokens per second)\n", + "llama_print_timings: prompt eval time = 10730.21 ms / 36 tokens ( 298.06 ms per token, 3.36 tokens per second)\n", + "llama_print_timings: eval time = 11313.84 ms / 75 runs ( 150.85 ms per token, 6.63 tokens per second)\n", + "llama_print_timings: total time = 22315.32 ms\n", "Llama.generate: prefix-match hit\n", "\n", - "llama_print_timings: load time = 12348.40 ms\n", - "llama_print_timings: sample time = 106.40 ms / 128 runs ( 0.83 ms per token, 1202.96 tokens per second)\n", - "llama_print_timings: prompt eval time = 9926.22 ms / 53 tokens ( 187.29 ms per token, 5.34 tokens per second)\n", - "llama_print_timings: eval time = 17259.87 ms / 127 runs ( 135.90 ms per token, 7.36 tokens per second)\n", - "llama_print_timings: total time = 27638.57 ms\n" + "llama_print_timings: load time = 11862.64 ms\n", + "llama_print_timings: sample time = 16.72 ms / 20 runs ( 0.84 ms per token, 1196.03 tokens per second)\n", + 
"llama_print_timings: prompt eval time = 9572.17 ms / 51 tokens ( 187.69 ms per token, 5.33 tokens per second)\n", + "llama_print_timings: eval time = 2527.02 ms / 19 runs ( 133.00 ms per token, 7.52 tokens per second)\n", + "llama_print_timings: total time = 12167.14 ms\n" ] } ], @@ -295,11 +295,12 @@ " A simple test that checks semantic similarity between the user input\n", " and the model's text responses.\n", " \"\"\"\n", - " distances = [\n", - " autoeval.compute(prompt, response)\n", - " for response in extract_responses(results)\n", - " ]\n", - " return min(distances)\n" + " return 0.0\n", + "# distances = [\n", + "# autoeval.compute(prompt, response)\n", + "# for response in extract_responses(results)\n", + "# ]\n", + "# return min(distances)\n" ] }, { @@ -361,129 +362,129 @@ " \n", " \n", " 0\n", - " \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n\n", - " [Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. 
\\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]]\n", - " 3.025437\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<</SYS>\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000004\n", " gpt-4-0314\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 1\n", - " \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n\n", - " [Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!]\n", - " 4.394606\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<</SYS>\\n\\nDo you offer refunds?\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000003\n", " gpt-4-0314\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 2\n", - " \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n\n", - " [No, 147 is not a prime number. 
It can be factored into 3 x 49, which means it has divisors other than 1 and itself.]\n", - " 1.701209\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<</SYS>\\n\\nIs 147 a prime number?\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4-0314\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 3\n", - " \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n\n", - " [{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}]\n", - " 1.380243\n", + " \\n<s>[INST] <<SYS>>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<</SYS>\\n\\nThe speed of the car was 45mph\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4-0314\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 4\n", - " \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n\n", - " [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. 
Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]]\n", - " 6.235348\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<</SYS>\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4-0613\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 5\n", - " \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n\n", - " [No, I'm sorry but Hegel AI does not offer refunds.]\n", - " 2.049971\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<</SYS>\\n\\nDo you offer refunds?\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4-0613\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 6\n", - " \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n\n", - " [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. 
However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.]\n", - " 4.651636\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<</SYS>\\n\\nIs 147 a prime number?\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4-0613\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 7\n", - " \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n\n", - " [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}]\n", - " 1.865187\n", + " \\n<s>[INST] <<SYS>>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<</SYS>\\n\\nThe speed of the car was 45mph\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4-0613\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 8\n", - " \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n\n", - " [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. 
Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]]\n", - " 5.846508\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<</SYS>\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000003\n", " gpt-4\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 9\n", - " \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n\n", - " [No, I'm sorry, but Hegel AI does not offer refunds.]\n", - " 2.072807\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<</SYS>\\n\\nDo you offer refunds?\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 10\n", - " \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n\n", - " [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. 
However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.]\n", - " 4.129878\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<</SYS>\\n\\nIs 147 a prime number?\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 11\n", - " \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n\n", - " [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}]\n", - " 1.746500\n", + " \\n<s>[INST] <<SYS>>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<</SYS>\\n\\nThe speed of the car was 45mph\\n [/INST]\\n\n", + " [George Washington]\n", + " 0.000002\n", " gpt-4\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 12\n", - " \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n\n", - " [\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? 
\\n\\nBest,\\n[Your Name]]\n", - " 25.770857\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<</SYS>\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n\n", + " [Subject: Quick Chat on Language Models!\\n\\nHi [Prospect Name],\\nI hope you are doing well! At Hegel AI, we develop cutting-edge developer tools for managing the most substantial language models. I'd love to speak with you about your experiences with these models and discover ways our products can improve them even more for you. Would it be feasible if I scheduled a quick 15-minute call with you? ]\n", + " 26.088359\n", " llama-2-7b-chat.ggmlv3.q2_K.bin\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 13\n", - " \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n\n", - " [\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!]\n", - " 26.407377\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<</SYS>\\n\\nDo you offer refunds?\\n [/INST]\\n\n", + " [Hi there! Thank you for reaching out to us at Hegel AI. I'm happy to help answer your question. 
Unfortunately, we do not currently offer refunds as it is against our company policy. Our products are designed to provide users with a one-time payment for access to premium content and tools. Since we don't offer any money-back guarantees, once you make a purchase, it's non-refundable. \\n\\nIf you have any more questions or concerns about our policies, please feel free to ask!]\n", + " 30.075213\n", " llama-2-7b-chat.ggmlv3.q2_K.bin\n", - " 1.0\n", + " 0.0\n", " \n", " \n", " 14\n", - " \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n\n", - " [\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.]\n", - " 17.878741\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<</SYS>\\n\\nIs 147 a prime number?\\n [/INST]\\n\n", + " [Of course, I'd be happy to help! A prime number is a positive integer greater than 1 that is divisible only by itself and 1. 
Since 147 can be divided by 2, 3, 5, 6, 7, 8, 9, and 13, it is not a prime number.]\n", + " 22.316188\n", " llama-2-7b-chat.ggmlv3.q2_K.bin\n", " 0.0\n", " \n", " \n", " 15\n", - " \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n\n", - " [\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n]\n", - " 27.639712\n", + " \\n<s>[INST] <<SYS>>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<</SYS>\\n\\nThe speed of the car was 45mph\\n [/INST]\\n\n", + " [{\\n\"metric_name\": \"Speed\",\\n\"value\": 45\\n}]\n", + " 12.168325\n", " llama-2-7b-chat.ggmlv3.q2_K.bin\n", " 0.0\n", " \n", @@ -492,59 +493,59 @@ "" ], "text/plain": [ - " prompt \n", - "0 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \\\n", - "1 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", - "2 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", - "3 \\n### 
INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", - "4 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \n", - "5 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", - "6 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", - "7 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", - "8 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \n", - "9 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", - "10 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime 
number?\\n\\n\\n### OUTPUT\\n \n", - "11 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", - "12 \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n \n", - "13 \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n \n", - "14 \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n \n", - "15 \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n \n", + " prompt \n", + "0 \\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n \\\n", + "1 \\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n \n", + "2 \\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n 
[/INST]\\n \n", + "3 \\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n \n", + "4 \\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n \n", + "5 \\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n \n", + "6 \\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n \n", + "7 \\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n \n", + "8 \\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n \n", + "9 \\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n \n", + "10 \\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n \n", + "11 \\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed 
of the car was 45mph\\n [/INST]\\n \n", + "12 \\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n \n", + "13 \\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n \n", + "14 \\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n \n", + "15 \\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n \n", "\n", - " response(s) \n", - "0 [Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. \\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]] \\\n", - "1 [Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!] \n", - "2 [No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.] \n", - "3 [{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}] \n", - "4 [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. 
We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]] \n", - "5 [No, I'm sorry but Hegel AI does not offer refunds.] \n", - "6 [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", - "7 [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \n", - "8 [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]] \n", - "9 [No, I'm sorry, but Hegel AI does not offer refunds.] \n", - "10 [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", - "11 [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \n", - "12 [\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]] \n", - "13 [\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. 
It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!] \n", - "14 [\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.] \n", - "15 [\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n] \n", + " response(s) \n", + "0 [George Washington] \\\n", + "1 [George Washington] \n", + "2 [George Washington] \n", + "3 [George Washington] \n", + "4 [George Washington] \n", + "5 [George Washington] \n", + "6 [George Washington] \n", + "7 [George Washington] \n", + "8 [George Washington] \n", + "9 [George Washington] \n", + "10 [George Washington] \n", + "11 [George Washington] \n", + "12 [Subject: Quick Chat on Language Models!\\n\\nHi [Prospect Name],\\nI hope you are doing well! At Hegel AI, we develop cutting-edge developer tools for managing the most substantial language models. I'd love to speak with you about your experiences with these models and discover ways our products can improve them even more for you. Would it be feasible if I scheduled a quick 15-minute call with you? ] \n", + "13 [Hi there! Thank you for reaching out to us at Hegel AI. I'm happy to help answer your question. Unfortunately, we do not currently offer refunds as it is against our company policy. Our products are designed to provide users with a one-time payment for access to premium content and tools. Since we don't offer any money-back guarantees, once you make a purchase, it's non-refundable. 
\\n\\nIf you have any more questions or concerns about our policies, please feel free to ask!] \n", + "14 [Of course, I'd be happy to help! A prime number is a positive integer greater than 1 that is divisible only by itself and 1. Since 147 can be divided by 2, 3, 5, 6, 7, 8, 9, and 13, it is not a prime number.] \n", + "15 [{\\n\"metric_name\": \"Speed\",\\n\"value\": 45\\n}] \n", "\n", " latency model auto-evaluation \n", - "0 3.025437 gpt-4-0314 1.0 \n", - "1 4.394606 gpt-4-0314 1.0 \n", - "2 1.701209 gpt-4-0314 1.0 \n", - "3 1.380243 gpt-4-0314 1.0 \n", - "4 6.235348 gpt-4-0613 1.0 \n", - "5 2.049971 gpt-4-0613 1.0 \n", - "6 4.651636 gpt-4-0613 1.0 \n", - "7 1.865187 gpt-4-0613 1.0 \n", - "8 5.846508 gpt-4 1.0 \n", - "9 2.072807 gpt-4 1.0 \n", - "10 4.129878 gpt-4 1.0 \n", - "11 1.746500 gpt-4 1.0 \n", - "12 25.770857 llama-2-7b-chat.ggmlv3.q2_K.bin 1.0 \n", - "13 26.407377 llama-2-7b-chat.ggmlv3.q2_K.bin 1.0 \n", - "14 17.878741 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 \n", - "15 27.639712 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 " + "0 0.000004 gpt-4-0314 0.0 \n", + "1 0.000003 gpt-4-0314 0.0 \n", + "2 0.000002 gpt-4-0314 0.0 \n", + "3 0.000002 gpt-4-0314 0.0 \n", + "4 0.000002 gpt-4-0613 0.0 \n", + "5 0.000002 gpt-4-0613 0.0 \n", + "6 0.000002 gpt-4-0613 0.0 \n", + "7 0.000002 gpt-4-0613 0.0 \n", + "8 0.000003 gpt-4 0.0 \n", + "9 0.000002 gpt-4 0.0 \n", + "10 0.000002 gpt-4 0.0 \n", + "11 0.000002 gpt-4 0.0 \n", + "12 26.088359 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 \n", + "13 30.075213 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 \n", + "14 22.316188 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 \n", + "15 12.168325 llama-2-7b-chat.ggmlv3.q2_K.bin 0.0 " ] }, "execution_count": 11, @@ -606,65 +607,65 @@ " \n", " \n", " \n", - " \\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n\n", - " [\\n{\\n \"metric_name\": \"car_speed\",\\n 
\"value\": \"45mph\"\\n}]\n", - " [{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}]\n", - " [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}]\n", - " [\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n]\n", + " \\n<s>[INST] <<SYS>>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<</SYS>\\n\\nThe speed of the car was 45mph\\n [/INST]\\n\n", + " [George Washington]\n", + " [George Washington]\n", + " [George Washington]\n", + " [{\\n\"metric_name\": \"Speed\",\\n\"value\": 45\\n}]\n", " \n", " \n", - " \\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n\n", - " [No, I'm sorry, but Hegel AI does not offer refunds.]\n", - " [Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!]\n", - " [No, I'm sorry but Hegel AI does not offer refunds.]\n", - " [\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. 
It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!]\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<</SYS>\\n\\nDo you offer refunds?\\n [/INST]\\n\n", + " [George Washington]\n", + " [George Washington]\n", + " [George Washington]\n", + " [Hi there! Thank you for reaching out to us at Hegel AI. I'm happy to help answer your question. Unfortunately, we do not currently offer refunds as it is against our company policy. Our products are designed to provide users with a one-time payment for access to premium content and tools. Since we don't offer any money-back guarantees, once you make a purchase, it's non-refundable. \\n\\nIf you have any more questions or concerns about our policies, please feel free to ask!]\n", " \n", " \n", - " \\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n\n", - " [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.]\n", - " [No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.]\n", - " [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.]\n", - " [\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. 
Since 147 is divisible only by 1 and itself, it is a prime number.]\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<</SYS>\\n\\nIs 147 a prime number?\\n [/INST]\\n\n", + " [George Washington]\n", + " [George Washington]\n", + " [George Washington]\n", + " [Of course, I'd be happy to help! A prime number is a positive integer greater than 1 that is divisible only by itself and 1. Since 147 can be divided by 2, 3, 5, 6, 7, 8, 9, and 13, it is not a prime number.]\n", " \n", " \n", - " \\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n\n", - " [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]]\n", - " [Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. \\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]]\n", - " [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. 
Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]]\n", - " [\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]]\n", + " \\n<s>[INST] <<SYS>>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<</SYS>\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n\n", + " [George Washington]\n", + " [George Washington]\n", + " [George Washington]\n", + " [Subject: Quick Chat on Language Models!\\n\\nHi [Prospect Name],\\nI hope you are doing well! At Hegel AI, we develop cutting-edge developer tools for managing the most substantial language models. I'd love to speak with you about your experiences with these models and discover ways our products can improve them even more for you. Would it be feasible if I scheduled a quick 15-minute call with you? ]\n", " \n", " \n", "\n", "" ], "text/plain": [ - "model gpt-4 \n", - "prompt \n", - "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \\\n", - "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [No, I'm sorry, but Hegel AI does not offer refunds.] 
\n", - "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [No, 147 is not a prime number. A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", - "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help optimize your current processes?\\n\\nBest,\\n[Your Name]] \n", + "model gpt-4 \n", + "prompt \n", + "\\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n [George Washington] \\\n", + "\\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n [George Washington] \n", + "\\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n [George Washington] \n", + "\\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect 
for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n [George Washington] \n", "\n", - "model gpt-4-0314 \n", - "prompt \n", - "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [{\\n \"metric_name\": \"car_speed\",\\n \"value\": 45\\n}] \\\n", - "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [Thank you for reaching out to Hegel AI customer support. We understand that you may have concerns about refunds. However, as per our company policy, we do not offer refunds. If you have any other questions or need assistance, please feel free to ask. We're here to help!] \n", - "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [No, 147 is not a prime number. It can be factored into 3 x 49, which means it has divisors other than 1 and itself.] \n", - "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [Subject: Boost Your Large Language Model Efficiency?\\n\\nHi [Prospect's Name],\\n\\nAre you maximizing the potential of large language models? Let's chat for 15 minutes to explore how Hegel AI's developer tools can elevate your team's efficiency and productivity. 
\\n\\nLooking forward to your response!\\n\\nBest,\\n[Your Name]] \n", + "model gpt-4-0314 \n", + "prompt \n", + "\\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n [George Washington] \\\n", + "\\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n [George Washington] \n", + "\\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n [George Washington] \n", + "\\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n [George Washington] \n", "\n", - "model gpt-4-0613 \n", - "prompt \n", - "\\n### INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [\\n{\\n \"metric_name\": \"car_speed\",\\n \"value\": \"45mph\"\\n}] \\\n", - "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [No, I'm sorry but Hegel AI does not offer refunds.] \n", - "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [No, 147 is not a prime number. 
A prime number is a number that has only two distinct positive divisors: 1 and itself. However, 147 can be divided evenly by 1, 3, 7, 21, 49, and 147, so it does not meet the criteria.] \n", - "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [Subject: Enhance Your Large Language Models with Hegel AI\\n\\nHi [Prospect's Name],\\n\\nI'm reaching out from Hegel AI. We've developed tools that could significantly improve your work with large language models. Could we schedule a quick 15-minute chat to discuss how we can help streamline your processes?\\n\\nBest,\\n[Your Name]] \n", + "model gpt-4-0613 \n", + "prompt \n", + "\\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n [George Washington] \\\n", + "\\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n [George Washington] \n", + "\\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n [George Washington] \n", + "\\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n [George Washington] \n", "\n", - "model llama-2-7b-chat.ggmlv3.q2_K.bin \n", - "prompt \n", - "\\n### 
INSTRUCTION\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n\\n### INPUT\\n\\nThe speed of the car was 45mph\\n\\n\\n### OUTPUT\\n [\\n\\n{\\n\"metric_name\": \"speed\",\\n\"value\": 45\\n}\\n\\nThe time taken to complete the task is 20 minutes\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"time_taken\",\\n\"value\": 20\\n}\\n\\nThe distance of the car is 200 miles\\n\\n\\n### OUTPUT\\n\\n\\n{\\n\"metric_name\": \"distance\",\\n\"value\": 200\\n}\\n\\nThe price of the car is $30,000\\n\\n\\n] \n", - "\\n### INSTRUCTION\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n\\n### INPUT\\n\\nDo you offer refunds?\\n\\n\\n### OUTPUT\\n [\\nSorry to hear that you're not satisfied with our product. However, we don't offer refunds as it goes against our company policy and is stated in our terms of service. I apologize but there are no exceptions made for customers who wish they had a refund. It's important to understand that once you make a purchase, it's final and non-refundable.\\nIf you have any other questions or concerns, please feel free to ask!] \n", - "\\n### INSTRUCTION\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n\\n### INPUT\\n\\nIs 147 a prime number?\\n\\n\\n### OUTPUT\\n [\\nYes, 147 is a prime number.\\n\\n### EXPLANATION\\n\\nA prime number is a positive integer that is divisible only by itself and 1. Since 147 is divisible only by 1 and itself, it is a prime number.] 
\n", - "\\n### INSTRUCTION\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n\\n### INPUT\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n\\n\\n### OUTPUT\\n [\\nHi [Prospect Name],\\n\\nI hope you're doing well! I'm reaching out from Hegel AI. I see your organization is leveraging large language models in exciting ways - can I ask a few questions to better understand how these models are impacting your work? Can you spare 15 minutes next week for us to chat? \\n\\nBest,\\n[Your Name]] " + "model llama-2-7b-chat.ggmlv3.q2_K.bin \n", + "prompt \n", + "\\n[INST] <>\\n\\nBased on each input you receive, produce a valid JSON using only the keys \"metric_name\" and \"value\".\\n\\n<\\n\\nThe speed of the car was 45mph\\n [/INST]\\n [{\\n\"metric_name\": \"Speed\",\\n\"value\": 45\\n}] \n", + "\\n[INST] <>\\n\\nYou are a customer support representative for a startup called Hegel AI.\\nFor context, Hegel AI does not offer refunds.\\nAnswer the following customer question:\\n\\n<\\n\\nDo you offer refunds?\\n [/INST]\\n [Hi there! Thank you for reaching out to us at Hegel AI. I'm happy to help answer your question. Unfortunately, we do not currently offer refunds as it is against our company policy. Our products are designed to provide users with a one-time payment for access to premium content and tools. Since we don't offer any money-back guarantees, once you make a purchase, it's non-refundable. \\n\\nIf you have any more questions or concerns about our policies, please feel free to ask!] \n", + "\\n[INST] <>\\n\\nYou are a helpful math tutor.\\nAnswer the following math problem:\\n\\n<\\n\\nIs 147 a prime number?\\n [/INST]\\n [Of course, I'd be happy to help! A prime number is a positive integer greater than 1 that is divisible only by itself and 1. 
Since 147 can be divided by 2, 3, 5, 6, 7, 8, 9, and 13, it is not a prime number.] \n", + "\\n[INST] <>\\n\\nYou are a sales development representative for a startup called Hegel AI.\\nYour startup builds developer tools for large language models.\\n\\n<\\n\\nDraft a short sales email, 50 words or less, asking a prospect for 15 minutes\\nof their time to chat about how they're using large language models.\\n [/INST]\\n [Subject: Quick Chat on Language Models!\\n\\nHi [Prospect Name],\\nI hope you are doing well! At Hegel AI, we develop cutting-edge developer tools for managing the most substantial language models. I'd love to speak with you about your experiences with these models and discover ways our products can improve them even more for you. Would it be feasible if I scheduled a quick 15-minute call with you? ] " ] }, "execution_count": 12, diff --git a/prompttools/selector/prompt_selector.py b/prompttools/selector/prompt_selector.py index 92e11a1a..ebdf7006 100644 --- a/prompttools/selector/prompt_selector.py +++ b/prompttools/selector/prompt_selector.py @@ -5,13 +5,10 @@ # LICENSE file in the root directory of this source tree. TEMPLATE = """ -### INSTRUCTION +[INST] <> {instruction} - -### INPUT -{user_input} - -### OUTPUT +< +{user_input} [/INST] """