Commit

Add initial Benchmarking class (#72)
* benchmark boilerplate

* rearrange files

* benchmarking

* hellaswag test data

* cut down dataset

* cleaned up notebook

* docstring
HashemAlsaket authored Aug 15, 2023
1 parent c3d897a commit 9a7902a
Showing 5 changed files with 6,494 additions and 0 deletions.
385 changes: 385 additions & 0 deletions examples/notebooks/benchmark/Benchmarking.ipynb
@@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5e607dbf",
"metadata": {},
"source": [
"# Benchmarking Using HellaSwag Dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5d0b7ccc",
"metadata": {},
"outputs": [],
"source": [
"# ind: question index\n",
"# id: question id\n",
"# activity_label: A short phrase describing the events in the question\n",
"# ctx: The full context for the question\n",
"# ctx_a: The first sentence of the context\n",
"# ctx_b: The second sentence of the context\n",
"# dataset: Domain of the question -- e.g. activitynet / wikihow\n",
"# ending_options: A list of four ending choices"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "034294fe",
"metadata": {},
"outputs": [],
"source": [
"from prompttools.benchmarks import Benchmark\n",
"from prompttools.experiment import (\n",
" LlamaCppExperiment,\n",
" OpenAIChatExperiment,\n",
" HuggingFaceHubExperiment,\n",
")\n",
"from prompttools.utils import semantic_similarity\n",
"\n",
"import pandas as pd\n",
"import datetime\n",
"import json"
]
},
{
"cell_type": "markdown",
"id": "d0d8ac2d",
"metadata": {},
"source": [
"## Setup HellaSwag Dataset for Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "66ecf3ca",
"metadata": {},
"outputs": [],
"source": [
"# Sample size to test\n",
"sample_size = 3\n",
"\n",
"data = []\n",
"with open('prompttools/data/benchmarking/hellaswag/hellaswag_dataset.jsonl', 'r') as file:\n",
" for line in file:\n",
" try:\n",
" json_obj = json.loads(line)\n",
" data.append([json_obj['ctx'], json_obj['ending_options']])\n",
" except json.JSONDecodeError:\n",
" print(f\"Skipped invalid JSON: {line}\")\n",
"labels = []\n",
"with open('prompttools/data/benchmarking/hellaswag/hellaswag_labels.lst', 'r') as file:\n",
" for line in file:\n",
" try:\n",
" json_obj = json.loads(line)\n",
" labels.append(json_obj)\n",
" except json.JSONDecodeError:\n",
" print(f\"Skipped invalid JSON: {line}\")\n",
"\n",
"hella_swag = pd.DataFrame(data, columns=['ctx', 'ending_options'])\n",
"hella_swag[\"labels\"] = labels\n",
"hella_swag = hella_swag.head(sample_size) # sample\n",
"\n",
"sample_ctxs = hella_swag['ctx'].values\n",
"sample_ending_options = hella_swag['ending_options'].values\n",
"sample_labels = hella_swag['labels'].values"
]
},
{
"cell_type": "markdown",
"id": "a7c1e08f",
"metadata": {},
"source": [
"## Model Params"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f0fb7b81",
"metadata": {},
"outputs": [],
"source": [
"temperatures = [0.5]"
]
},
{
"cell_type": "markdown",
"id": "1a76f2e1",
"metadata": {},
"source": [
"## Setup Experiments to Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "669931ba",
"metadata": {},
"outputs": [],
"source": [
"# Google Flan T5 XXL\n",
"models = [\"google/flan-t5-xxl\"]\n",
"prompts = sample_ctxs\n",
"task = [\"text-generation\"]\n",
"google_flan_t5_xxl_experiment = HuggingFaceHubExperiment(\n",
" models, prompts, task, temperature=temperatures\n",
")\n",
"benchmarking_google_flan_t5_xxl = Benchmark(\n",
" experiment=google_flan_t5_xxl_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels\n",
")\n",
"\n",
"# Vicuna 7b\n",
"vicuna7b_experiment = LlamaCppExperiment(\n",
" [\n",
" \"../llama/llama.cpp/models/7b/ggml-vicuna-7b-1.1-q4_0.bin\",\n",
" ],\n",
" sample_ctxs,\n",
" call_params=dict(temperature=temperatures),\n",
")\n",
"benchmarking_vicuna7b = Benchmark(\n",
" experiment=vicuna7b_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels\n",
")\n",
"\n",
"# Vicuna 13b\n",
"vicuna13b_experiment = LlamaCppExperiment(\n",
" [\n",
" \"../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\",\n",
" ],\n",
" sample_ctxs,\n",
" call_params=dict(temperature=temperatures),\n",
")\n",
"benchmarking_vicuna13b = Benchmark(\n",
" experiment=vicuna13b_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels\n",
")\n",
"\n",
"# OpenAI Chat\n",
"openai_chat_experiment = OpenAIChatExperiment(\n",
" [\"gpt-3.5-turbo\"],\n",
" [\n",
" [{\"role\": \"system\", \"content\": c}]\n",
" for c in sample_ctxs\n",
" ],\n",
" temperature=temperatures\n",
")\n",
"benchmarking_openai_chat = Benchmark(\n",
" experiment=openai_chat_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "3902310d",
"metadata": {},
"source": [
"## Run Benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9395b236",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start time: 2023-08-14 22:13:49.096696\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a different task than the one specified in the repository. Be sure to know what you're doing :)\n",
"WARNING:huggingface_hub.inference_api:You're using a different task than the one specified in the repository. Be sure to know what you're doing :)\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n",
"llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32000\n",
"llama_model_load_internal: n_ctx = 512\n",
"llama_model_load_internal: n_embd = 5120\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 40\n",
"llama_model_load_internal: n_layer = 40\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: n_ff = 13824\n",
"llama_model_load_internal: model size = 13B\n",
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
"llama_new_context_with_model: kv self size = 400.00 MB\n",
"\n",
"llama_print_timings: load time = 3131.00 ms\n",
"llama_print_timings: sample time = 48.04 ms / 44 runs ( 1.09 ms per token, 915.88 tokens per second)\n",
"Llama.generate: prefix-match hit\n",
"llama_print_timings: prompt eval time = 3130.95 ms / 27 tokens ( 115.96 ms per token, 8.62 tokens per second)\n",
"llama_print_timings: eval time = 8650.51 ms / 43 runs ( 201.17 ms per token, 4.97 tokens per second)\n",
"llama_print_timings: total time = 11952.95 ms\n",
"\n",
"llama_print_timings: load time = 3131.00 ms\n",
"llama_print_timings: sample time = 87.41 ms / 76 runs ( 1.15 ms per token, 869.45 tokens per second)\n",
"llama_print_timings: prompt eval time = 3275.57 ms / 25 tokens ( 131.02 ms per token, 7.63 tokens per second)\n",
"llama_print_timings: eval time = 14374.18 ms / 75 runs ( 191.66 ms per token, 5.22 tokens per second)\n",
"llama_print_timings: total time = 17955.90 ms\n",
"Llama.generate: prefix-match hit\n",
"\n",
"llama_print_timings: load time = 3131.00 ms\n",
"llama_print_timings: sample time = 21.62 ms / 18 runs ( 1.20 ms per token, 832.64 tokens per second)\n",
"llama_print_timings: prompt eval time = 2373.56 ms / 18 tokens ( 131.86 ms per token, 7.58 tokens per second)\n",
"llama_print_timings: eval time = 3258.29 ms / 17 runs ( 191.66 ms per token, 5.22 tokens per second)\n",
"llama_print_timings: total time = 5704.84 ms\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n",
"llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32000\n",
"llama_model_load_internal: n_ctx = 512\n",
"llama_model_load_internal: n_embd = 5120\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 40\n",
"llama_model_load_internal: n_layer = 40\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: n_ff = 13824\n",
"llama_model_load_internal: model size = 13B\n",
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
"llama_new_context_with_model: kv self size = 400.00 MB\n",
"\n",
"llama_print_timings: load time = 3570.02 ms\n",
"llama_print_timings: sample time = 49.62 ms / 44 runs ( 1.13 ms per token, 886.79 tokens per second)\n",
"llama_print_timings: prompt eval time = 3569.99 ms / 27 tokens ( 132.22 ms per token, 7.56 tokens per second)\n",
"llama_print_timings: eval time = 8310.00 ms / 43 runs ( 193.26 ms per token, 5.17 tokens per second)\n",
"llama_print_timings: total time = 12056.50 ms\n",
"Llama.generate: prefix-match hit\n",
"\n",
"llama_print_timings: load time = 3570.02 ms\n",
"llama_print_timings: sample time = 87.72 ms / 76 runs ( 1.15 ms per token, 866.40 tokens per second)\n",
"llama_print_timings: prompt eval time = 3267.08 ms / 25 tokens ( 130.68 ms per token, 7.65 tokens per second)\n",
"llama_print_timings: eval time = 14375.07 ms / 75 runs ( 191.67 ms per token, 5.22 tokens per second)\n",
"llama_print_timings: total time = 17950.28 ms\n",
"Llama.generate: prefix-match hit\n",
"\n",
"llama_print_timings: load time = 3570.02 ms\n",
"llama_print_timings: sample time = 21.12 ms / 18 runs ( 1.17 ms per token, 852.43 tokens per second)\n",
"llama_print_timings: prompt eval time = 2387.83 ms / 18 tokens ( 132.66 ms per token, 7.54 tokens per second)\n",
"llama_print_timings: eval time = 3226.78 ms / 17 runs ( 189.81 ms per token, 5.27 tokens per second)\n",
"llama_print_timings: total time = 5687.06 ms\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n",
"/tmp/ipykernel_177845/406750325.py:7: UserWarning: Column 'prompt' does not exist. Using column 'messages' instead.\n",
" openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time taken: 0:01:51.087746\n"
]
}
],
"source": [
"start = datetime.datetime.now()\n",
"print(f\"Start time: {start}\")\n",
"\n",
"google_flan_t5_xxl_results = benchmarking_google_flan_t5_xxl.multiple_choice_benchmark()\n",
"vicuna7b_results = benchmarking_vicuna7b.multiple_choice_benchmark()\n",
"vicuna13b_results = benchmarking_vicuna13b.multiple_choice_benchmark()\n",
"openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()\n",
"\n",
"print(\"Time taken: \", datetime.datetime.now() - start)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c63e5335",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" google_flan_t5_xxl vicuna7b vicuna13b openai_chat\n",
"0 0.333333 0.333333 0.333333 0.666667 \n"
]
}
],
"source": [
"print(pd.DataFrame(\n",
" data=[[google_flan_t5_xxl_results, vicuna7b_results, vicuna13b_results, openai_chat_results]],\n",
" columns=[\"google_flan_t5_xxl\", \"vicuna7b\", \"vicuna13b\", \"openai_chat\"]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"vscode": {
"interpreter": {
"hash": "eec05f12730ef3ef66f433616fcd3cfdacd3dcf1f1c49c706eaa0465be8f325b"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
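Note on scoring: judging from the arguments the notebook passes (`eval_method=semantic_similarity`, `response_options`, `correct_response_indices`) and the accuracy row it prints at the end (0.333 for the Flan-T5 and Vicuna runs, 0.667 for OpenAI chat over the three sampled questions), `multiple_choice_benchmark()` appears to score each model response against the four ending options and count a hit when the best-scoring option matches the labeled index. A minimal sketch of that idea, with illustrative names only -- not the actual `prompttools/benchmarks/benchmark.py` internals:

from typing import Callable, List

def score_multiple_choice(
    responses: List[str],
    option_lists: List[List[str]],
    correct_indices: List[int],
    similarity: Callable[[str, str], float],
) -> float:
    # Hypothetical restatement of the scoring loop: pick the ending
    # option most similar to the model's response, then check it
    # against the gold label index. Returns the hit rate.
    hits = 0
    for response, options, gold in zip(responses, option_lists, correct_indices):
        scores = [similarity(response, option) for option in options]
        hits += int(scores.index(max(scores)) == gold)
    return hits / len(responses)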
12 changes: 12 additions & 0 deletions prompttools/benchmarks/__init__.py
@@ -0,0 +1,12 @@
# Copyright (c) Hegel AI, Inc.
# All rights reserved.
#
# This source code's license can be found in the
# LICENSE file in the root directory of this source tree.

from .benchmark import Benchmark


__all__ = [
"Benchmark",
]
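With that re-export in place, downstream code only needs the package-level import, which is the pattern the notebook above uses:

# Package-level import enabled by the re-export above; equivalent to
# importing from prompttools.benchmarks.benchmark directly.
from prompttools.benchmarks import Benchmark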