Commit

Add initial Benchmarking class (#72)
* benchmark boilerplate

* rearrange files

* benchmarking

* hellaswag test data

* cut down dataset

* cleaned up notebook

* docstring
HashemAlsaket authored Aug 15, 2023
1 parent c3d897a commit 9a7902a
Showing 5 changed files with 6,494 additions and 0 deletions.
385 changes: 385 additions & 0 deletions examples/notebooks/benchmark/Benchmarking.ipynb
@@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5e607dbf",
"metadata": {},
"source": [
"# Benchmarking Using HellaSwag Dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5d0b7ccc",
"metadata": {},
"outputs": [],
"source": [
"# ind: question index\n",
"# id: question id\n",
"# activity_label: A short phrase describing the events in the question\n",
"# ctx: The full context for the question\n",
"# ctx_a: The first sentence of the context\n",
"# ctx_b: The second sentence of the context\n",
"# dataset: Domain of the question -- e.g. activitynet / wikihow\n",
"# ending_options: A list of four ending choices"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "034294fe",
"metadata": {},
"outputs": [],
"source": [
"from prompttools.benchmarks import Benchmark\n",
"from prompttools.experiment import (\n",
" LlamaCppExperiment,\n",
" OpenAIChatExperiment,\n",
" HuggingFaceHubExperiment,\n",
")\n",
"from prompttools.utils import semantic_similarity\n",
"\n",
"import pandas as pd\n",
"import datetime\n",
"import json"
]
},
{
"cell_type": "markdown",
"id": "d0d8ac2d",
"metadata": {},
"source": [
"## Setup HellaSwag Dataset for Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "66ecf3ca",
"metadata": {},
"outputs": [],
"source": [
"# Sample size to test\n",
"sample_size = 3\n",
"\n",
"data = []\n",
"with open('prompttools/data/benchmarking/hellaswag/hellaswag_dataset.jsonl', 'r') as file:\n",
" for line in file:\n",
" try:\n",
" json_obj = json.loads(line)\n",
" data.append([json_obj['ctx'], json_obj['ending_options']])\n",
" except json.JSONDecodeError:\n",
" print(f\"Skipped invalid JSON: {line}\")\n",
"labels = []\n",
"with open('prompttools/data/benchmarking/hellaswag/hellaswag_labels.lst', 'r') as file:\n",
" for line in file:\n",
" try:\n",
" json_obj = json.loads(line)\n",
" labels.append(json_obj)\n",
" except json.JSONDecodeError:\n",
" print(f\"Skipped invalid JSON: {line}\")\n",
"\n",
"hella_swag = pd.DataFrame(data, columns=['ctx', 'ending_options'])\n",
"hella_swag[\"labels\"] = labels\n",
"hella_swag = hella_swag.head(sample_size) # sample\n",
"\n",
"sample_ctxs = hella_swag['ctx'].values\n",
"sample_ending_options = hella_swag['ending_options'].values\n",
"sample_labels = hella_swag['labels'].values"
]
},
{
"cell_type": "markdown",
"id": "a7c1e08f",
"metadata": {},
"source": [
"## Model Params"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f0fb7b81",
"metadata": {},
"outputs": [],
"source": [
"temperatures = [0.5]"
]
},
{
"cell_type": "markdown",
"id": "1a76f2e1",
"metadata": {},
"source": [
"## Setup Experiments to Benchmark"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "669931ba",
"metadata": {},
"outputs": [],
"source": [
"# Google Flan T5 XXL\n",
"models = [\"google/flan-t5-xxl\"]\n",
"prompts = sample_ctxs\n",
"task = [\"text-generation\"]\n",
"google_flan_t5_xxl_experiment = HuggingFaceHubExperiment(\n",
" models, prompts, task, temperature=temperatures\n",
")\n",
"benchmarking_google_flan_t5_xxl = Benchmark(\n",
" experiment=google_flan_t5_xxl_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels\n",
")\n",
"\n",
"# Vicuna 7b\n",
"vicuna7b_experiment = LlamaCppExperiment(\n",
" [\n",
" \"../llama/llama.cpp/models/7b/ggml-vicuna-7b-1.1-q4_0.bin\",\n",
" ],\n",
" sample_ctxs,\n",
" call_params=dict(temperature=temperatures),\n",
")\n",
"benchmarking_vicuna7b = Benchmark(\n",
" experiment=vicuna7b_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels\n",
")\n",
"\n",
"# Vicuna 13b\n",
"vicuna13b_experiment = LlamaCppExperiment(\n",
" [\n",
" \"../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\",\n",
" ],\n",
" sample_ctxs,\n",
" call_params=dict(temperature=temperatures),\n",
")\n",
"benchmarking_vicuna13b = Benchmark(\n",
" experiment=vicuna13b_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels\n",
")\n",
"\n",
"# OpenAI Chat\n",
"openai_chat_experiment = OpenAIChatExperiment(\n",
" [\"gpt-3.5-turbo\"],\n",
" [\n",
" [{\"role\": \"system\", \"content\": c}]\n",
" for c in sample_ctxs\n",
" ],\n",
" temperature=temperatures\n",
")\n",
"benchmarking_openai_chat = Benchmark(\n",
" experiment=openai_chat_experiment,\n",
" eval_method=semantic_similarity,\n",
" prompts=sample_ctxs,\n",
" response_options=sample_ending_options,\n",
" correct_response_indices=sample_labels,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "3902310d",
"metadata": {},
"source": [
"## Run Benchmarks"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "9395b236",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start time: 2023-08-14 22:13:49.096696\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a different task than the one specified in the repository. Be sure to know what you're doing :)\n",
"WARNING:huggingface_hub.inference_api:You're using a different task than the one specified in the repository. Be sure to know what you're doing :)\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n",
"llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32000\n",
"llama_model_load_internal: n_ctx = 512\n",
"llama_model_load_internal: n_embd = 5120\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 40\n",
"llama_model_load_internal: n_layer = 40\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: n_ff = 13824\n",
"llama_model_load_internal: model size = 13B\n",
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
"llama_new_context_with_model: kv self size = 400.00 MB\n",
"\n",
"llama_print_timings: load time = 3131.00 ms\n",
"llama_print_timings: sample time = 48.04 ms / 44 runs ( 1.09 ms per token, 915.88 tokens per second)\n",
"Llama.generate: prefix-match hit\n",
"llama_print_timings: prompt eval time = 3130.95 ms / 27 tokens ( 115.96 ms per token, 8.62 tokens per second)\n",
"llama_print_timings: eval time = 8650.51 ms / 43 runs ( 201.17 ms per token, 4.97 tokens per second)\n",
"llama_print_timings: total time = 11952.95 ms\n",
"\n",
"llama_print_timings: load time = 3131.00 ms\n",
"llama_print_timings: sample time = 87.41 ms / 76 runs ( 1.15 ms per token, 869.45 tokens per second)\n",
"llama_print_timings: prompt eval time = 3275.57 ms / 25 tokens ( 131.02 ms per token, 7.63 tokens per second)\n",
"llama_print_timings: eval time = 14374.18 ms / 75 runs ( 191.66 ms per token, 5.22 tokens per second)\n",
"llama_print_timings: total time = 17955.90 ms\n",
"Llama.generate: prefix-match hit\n",
"\n",
"llama_print_timings: load time = 3131.00 ms\n",
"llama_print_timings: sample time = 21.62 ms / 18 runs ( 1.20 ms per token, 832.64 tokens per second)\n",
"llama_print_timings: prompt eval time = 2373.56 ms / 18 tokens ( 131.86 ms per token, 7.58 tokens per second)\n",
"llama_print_timings: eval time = 3258.29 ms / 17 runs ( 191.66 ms per token, 5.22 tokens per second)\n",
"llama_print_timings: total time = 5704.84 ms\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n",
"llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32000\n",
"llama_model_load_internal: n_ctx = 512\n",
"llama_model_load_internal: n_embd = 5120\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 40\n",
"llama_model_load_internal: n_layer = 40\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: n_ff = 13824\n",
"llama_model_load_internal: model size = 13B\n",
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n",
"llama_new_context_with_model: kv self size = 400.00 MB\n",
"\n",
"llama_print_timings: load time = 3570.02 ms\n",
"llama_print_timings: sample time = 49.62 ms / 44 runs ( 1.13 ms per token, 886.79 tokens per second)\n",
"llama_print_timings: prompt eval time = 3569.99 ms / 27 tokens ( 132.22 ms per token, 7.56 tokens per second)\n",
"llama_print_timings: eval time = 8310.00 ms / 43 runs ( 193.26 ms per token, 5.17 tokens per second)\n",
"llama_print_timings: total time = 12056.50 ms\n",
"Llama.generate: prefix-match hit\n",
"\n",
"llama_print_timings: load time = 3570.02 ms\n",
"llama_print_timings: sample time = 87.72 ms / 76 runs ( 1.15 ms per token, 866.40 tokens per second)\n",
"llama_print_timings: prompt eval time = 3267.08 ms / 25 tokens ( 130.68 ms per token, 7.65 tokens per second)\n",
"llama_print_timings: eval time = 14375.07 ms / 75 runs ( 191.67 ms per token, 5.22 tokens per second)\n",
"llama_print_timings: total time = 17950.28 ms\n",
"Llama.generate: prefix-match hit\n",
"\n",
"llama_print_timings: load time = 3570.02 ms\n",
"llama_print_timings: sample time = 21.12 ms / 18 runs ( 1.17 ms per token, 852.43 tokens per second)\n",
"llama_print_timings: prompt eval time = 2387.83 ms / 18 tokens ( 132.66 ms per token, 7.54 tokens per second)\n",
"llama_print_timings: eval time = 3226.78 ms / 17 runs ( 189.81 ms per token, 5.27 tokens per second)\n",
"llama_print_timings: total time = 5687.06 ms\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n",
"/tmp/ipykernel_177845/406750325.py:7: UserWarning: Column 'prompt' does not exist. Using column 'messages' instead.\n",
" openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()\n",
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" benchmark_df[\"response_options\"] = self.response_options\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time taken: 0:01:51.087746\n"
]
}
],
"source": [
"start = datetime.datetime.now()\n",
"print(f\"Start time: {start}\")\n",
"\n",
"google_flan_t5_xxl_results = benchmarking_google_flan_t5_xxl.multiple_choice_benchmark()\n",
"vicuna7b_results = benchmarking_vicuna7b.multiple_choice_benchmark()\n",
"vicuna13b_results = benchmarking_vicuna13b.multiple_choice_benchmark()\n",
"openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()\n",
"\n",
"print(\"Time taken: \", datetime.datetime.now() - start)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c63e5335",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" google_flan_t5_xxl vicuna7b vicuna13b openai_chat\n",
"0 0.333333 0.333333 0.333333 0.666667 \n"
]
}
],
"source": [
"print(pd.DataFrame(\n",
" data=[[google_flan_t5_xxl_results, vicuna7b_results, vicuna13b_results, openai_chat_results]],\n",
" columns=[\"google_flan_t5_xxl\", \"vicuna7b\", \"vicuna13b\", \"openai_chat\"]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"vscode": {
"interpreter": {
"hash": "eec05f12730ef3ef66f433616fcd3cfdacd3dcf1f1c49c706eaa0465be8f325b"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
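Note on scoring: judging from the arguments the notebook passes (`eval_method=semantic_similarity`, `response_options`, `correct_response_indices`) and the accuracy row it prints at the end (0.333 for the Flan-T5 and Vicuna runs, 0.667 for OpenAI chat over the three sampled questions), `multiple_choice_benchmark()` appears to score each model response against the four ending options and count a hit when the best-scoring option matches the labeled index. A minimal sketch of that idea, with illustrative names only -- not the actual `prompttools/benchmarks/benchmark.py` internals:

from typing import Callable, List

def score_multiple_choice(
    responses: List[str],
    option_lists: List[List[str]],
    correct_indices: List[int],
    similarity: Callable[[str, str], float],
) -> float:
    # Hypothetical restatement of the scoring loop: pick the ending
    # option most similar to the model's response, then check it
    # against the gold label index. Returns the hit rate.
    hits = 0
    for response, options, gold in zip(responses, option_lists, correct_indices):
        scores = [similarity(response, option) for option in options]
        hits += int(scores.index(max(scores)) == gold)
    return hits / len(responses)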
12 changes: 12 additions & 0 deletions prompttools/benchmarks/__init__.py
@@ -0,0 +1,12 @@
# Copyright (c) Hegel AI, Inc.
# All rights reserved.
#
# This source code's license can be found in the
# LICENSE file in the root directory of this source tree.

from .benchmark import Benchmark


__all__ = [
"Benchmark",
]
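With that re-export in place, downstream code only needs the package-level import, which is the pattern the notebook above uses:

# Package-level import enabled by the re-export above; equivalent to
# importing from prompttools.benchmarks.benchmark directly.
from prompttools.benchmarks import Benchmark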