-
Notifications
You must be signed in to change notification settings - Fork 230
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add initial Benchmarking class (#72)
* benchmark boiler plate * rearrange files * benchmarking * hellaswag test data * cut down dataset * cleaned up notebook * doc string
- Loading branch information
1 parent
c3d897a
commit 9a7902a
Showing
5 changed files
with
6,494 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,385 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5e607dbf", | ||
"metadata": {}, | ||
"source": [ | ||
"# Benchmarking Using HellaSwag Dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "5d0b7ccc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fields of each question record in the HellaSwag dataset:\n", | ||
"# ind: question index\n", | ||
"# id: question id\n", | ||
"# activity_label: A short phrase describing the events in the question\n", | ||
"# ctx: The full context for the question\n", | ||
"# ctx_a: The first sentence of the context\n", | ||
"# ctx_b: The second sentence of the context\n", | ||
"# dataset: Domain of the question -- e.g. activitynet / wikihow\n", | ||
"# ending_options: A list of four ending choices" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "034294fe", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from prompttools.benchmarks import Benchmark\n", | ||
"from prompttools.experiment import (\n", | ||
" LlamaCppExperiment,\n", | ||
" OpenAIChatExperiment,\n", | ||
" HuggingFaceHubExperiment,\n", | ||
")\n", | ||
"from prompttools.utils import semantic_similarity\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import datetime\n", | ||
"import json" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "d0d8ac2d", | ||
"metadata": {}, | ||
"source": [ | ||
"## Set Up HellaSwag Dataset for Benchmarking" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "66ecf3ca", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Sample size to test\n", | ||
"sample_size = 3\n", | ||
"\n", | ||
"data = []\n", | ||
"with open('prompttools/data/benchmarking/hellaswag/hellaswag_dataset.jsonl', 'r') as file:\n", | ||
" for line in file:\n", | ||
" try:\n", | ||
" json_obj = json.loads(line)\n", | ||
" data.append([json_obj['ctx'], json_obj['ending_options']])\n", | ||
" except json.JSONDecodeError:\n", | ||
" print(f\"Skipped invalid JSON: {line}\")\n", | ||
"labels = []\n", | ||
"with open('prompttools/data/benchmarking/hellaswag/hellaswag_labels.lst', 'r') as file:\n", | ||
" for line in file:\n", | ||
" try:\n", | ||
" json_obj = json.loads(line)\n", | ||
" labels.append(json_obj)\n", | ||
" except json.JSONDecodeError:\n", | ||
" print(f\"Skipped invalid JSON: {line}\")\n", | ||
"\n", | ||
"hella_swag = pd.DataFrame(data, columns=['ctx', 'ending_options'])\n", | ||
"hella_swag[\"labels\"] = labels\n", | ||
"hella_swag = hella_swag.head(sample_size) # sample\n", | ||
"\n", | ||
"sample_ctxs = hella_swag['ctx'].values\n", | ||
"sample_ending_options = hella_swag['ending_options'].values\n", | ||
"sample_labels = hella_swag['labels'].values" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "a7c1e08f", | ||
"metadata": {}, | ||
"source": [ | ||
"## Model Params" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "f0fb7b81", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"temperatures = [0.5]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1a76f2e1", | ||
"metadata": {}, | ||
"source": [ | ||
"## Set Up Experiments to Benchmark" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "669931ba", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Google Flan T5 XXL\n", | ||
"models = [\"google/flan-t5-xxl\"]\n", | ||
"prompts = sample_ctxs\n", | ||
"task = [\"text-generation\"]\n", | ||
"google_flan_t5_xxl_experiment = HuggingFaceHubExperiment(\n", | ||
" models, prompts, task, temperature=temperatures\n", | ||
")\n", | ||
"benchmarking_google_flan_t5_xxl = Benchmark(\n", | ||
" experiment=google_flan_t5_xxl_experiment,\n", | ||
" eval_method=semantic_similarity,\n", | ||
" prompts=sample_ctxs,\n", | ||
" response_options=sample_ending_options,\n", | ||
" correct_response_indices=sample_labels\n", | ||
")\n", | ||
"\n", | ||
"# Vicuna 7b\n", | ||
"vicuna7b_experiment = LlamaCppExperiment(\n", | ||
" [\n", | ||
" \"../llama/llama.cpp/models/7b/ggml-vicuna-7b-1.1-q4_0.bin\",\n", | ||
" ],\n", | ||
" sample_ctxs,\n", | ||
" call_params=dict(temperature=temperatures),\n", | ||
")\n", | ||
"benchmarking_vicuna7b = Benchmark(\n", | ||
" experiment=vicuna7b_experiment,\n", | ||
" eval_method=semantic_similarity,\n", | ||
" prompts=sample_ctxs,\n", | ||
" response_options=sample_ending_options,\n", | ||
" correct_response_indices=sample_labels\n", | ||
")\n", | ||
"\n", | ||
"# Vicuna 13b\n", | ||
"vicuna13b_experiment = LlamaCppExperiment(\n", | ||
" [\n", | ||
" \"../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\",\n", | ||
" ],\n", | ||
" sample_ctxs,\n", | ||
" call_params=dict(temperature=temperatures),\n", | ||
")\n", | ||
"benchmarking_vicuna13b = Benchmark(\n", | ||
" experiment=vicuna13b_experiment,\n", | ||
" eval_method=semantic_similarity,\n", | ||
" prompts=sample_ctxs,\n", | ||
" response_options=sample_ending_options,\n", | ||
" correct_response_indices=sample_labels\n", | ||
")\n", | ||
"\n", | ||
"# OpenAI Chat\n", | ||
"openai_chat_experiment = OpenAIChatExperiment(\n", | ||
" [\"gpt-3.5-turbo\"],\n", | ||
" [\n", | ||
" [{\"role\": \"system\", \"content\": c}]\n", | ||
" for c in sample_ctxs\n", | ||
" ],\n", | ||
" temperature=temperatures\n", | ||
")\n", | ||
"benchmarking_openai_chat = Benchmark(\n", | ||
" experiment=openai_chat_experiment,\n", | ||
" eval_method=semantic_similarity,\n", | ||
" prompts=sample_ctxs,\n", | ||
" response_options=sample_ending_options,\n", | ||
" correct_response_indices=sample_labels,\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "3902310d", | ||
"metadata": {}, | ||
"source": [ | ||
"## Run Benchmarks" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "9395b236", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Start time: 2023-08-14 22:13:49.096696\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"You're using a different task than the one specified in the repository. Be sure to know what you're doing :)\n", | ||
"WARNING:huggingface_hub.inference_api:You're using a different task than the one specified in the repository. Be sure to know what you're doing :)\n", | ||
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n", | ||
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||
"Try using .loc[row_indexer,col_indexer] = value instead\n", | ||
"\n", | ||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||
" benchmark_df[\"response_options\"] = self.response_options\n", | ||
"llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\n", | ||
"llama_model_load_internal: format = ggjt v3 (latest)\n", | ||
"llama_model_load_internal: n_vocab = 32000\n", | ||
"llama_model_load_internal: n_ctx = 512\n", | ||
"llama_model_load_internal: n_embd = 5120\n", | ||
"llama_model_load_internal: n_mult = 256\n", | ||
"llama_model_load_internal: n_head = 40\n", | ||
"llama_model_load_internal: n_layer = 40\n", | ||
"llama_model_load_internal: n_rot = 128\n", | ||
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n", | ||
"llama_model_load_internal: n_ff = 13824\n", | ||
"llama_model_load_internal: model size = 13B\n", | ||
"llama_model_load_internal: ggml ctx size = 0.09 MB\n", | ||
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n", | ||
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", | ||
"llama_new_context_with_model: kv self size = 400.00 MB\n", | ||
"\n", | ||
"llama_print_timings: load time = 3131.00 ms\n", | ||
"llama_print_timings: sample time = 48.04 ms / 44 runs ( 1.09 ms per token, 915.88 tokens per second)\n", | ||
"Llama.generate: prefix-match hit\n", | ||
"llama_print_timings: prompt eval time = 3130.95 ms / 27 tokens ( 115.96 ms per token, 8.62 tokens per second)\n", | ||
"llama_print_timings: eval time = 8650.51 ms / 43 runs ( 201.17 ms per token, 4.97 tokens per second)\n", | ||
"llama_print_timings: total time = 11952.95 ms\n", | ||
"\n", | ||
"llama_print_timings: load time = 3131.00 ms\n", | ||
"llama_print_timings: sample time = 87.41 ms / 76 runs ( 1.15 ms per token, 869.45 tokens per second)\n", | ||
"llama_print_timings: prompt eval time = 3275.57 ms / 25 tokens ( 131.02 ms per token, 7.63 tokens per second)\n", | ||
"llama_print_timings: eval time = 14374.18 ms / 75 runs ( 191.66 ms per token, 5.22 tokens per second)\n", | ||
"llama_print_timings: total time = 17955.90 ms\n", | ||
"Llama.generate: prefix-match hit\n", | ||
"\n", | ||
"llama_print_timings: load time = 3131.00 ms\n", | ||
"llama_print_timings: sample time = 21.62 ms / 18 runs ( 1.20 ms per token, 832.64 tokens per second)\n", | ||
"llama_print_timings: prompt eval time = 2373.56 ms / 18 tokens ( 131.86 ms per token, 7.58 tokens per second)\n", | ||
"llama_print_timings: eval time = 3258.29 ms / 17 runs ( 191.66 ms per token, 5.22 tokens per second)\n", | ||
"llama_print_timings: total time = 5704.84 ms\n", | ||
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n", | ||
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||
"Try using .loc[row_indexer,col_indexer] = value instead\n", | ||
"\n", | ||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||
" benchmark_df[\"response_options\"] = self.response_options\n", | ||
"llama.cpp: loading model from ../llama/llama.cpp/models/13b/ggml-vic13b-uncensored-q4_0.bin\n", | ||
"llama_model_load_internal: format = ggjt v3 (latest)\n", | ||
"llama_model_load_internal: n_vocab = 32000\n", | ||
"llama_model_load_internal: n_ctx = 512\n", | ||
"llama_model_load_internal: n_embd = 5120\n", | ||
"llama_model_load_internal: n_mult = 256\n", | ||
"llama_model_load_internal: n_head = 40\n", | ||
"llama_model_load_internal: n_layer = 40\n", | ||
"llama_model_load_internal: n_rot = 128\n", | ||
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n", | ||
"llama_model_load_internal: n_ff = 13824\n", | ||
"llama_model_load_internal: model size = 13B\n", | ||
"llama_model_load_internal: ggml ctx size = 0.09 MB\n", | ||
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n", | ||
"AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | \n", | ||
"llama_new_context_with_model: kv self size = 400.00 MB\n", | ||
"\n", | ||
"llama_print_timings: load time = 3570.02 ms\n", | ||
"llama_print_timings: sample time = 49.62 ms / 44 runs ( 1.13 ms per token, 886.79 tokens per second)\n", | ||
"llama_print_timings: prompt eval time = 3569.99 ms / 27 tokens ( 132.22 ms per token, 7.56 tokens per second)\n", | ||
"llama_print_timings: eval time = 8310.00 ms / 43 runs ( 193.26 ms per token, 5.17 tokens per second)\n", | ||
"llama_print_timings: total time = 12056.50 ms\n", | ||
"Llama.generate: prefix-match hit\n", | ||
"\n", | ||
"llama_print_timings: load time = 3570.02 ms\n", | ||
"llama_print_timings: sample time = 87.72 ms / 76 runs ( 1.15 ms per token, 866.40 tokens per second)\n", | ||
"llama_print_timings: prompt eval time = 3267.08 ms / 25 tokens ( 130.68 ms per token, 7.65 tokens per second)\n", | ||
"llama_print_timings: eval time = 14375.07 ms / 75 runs ( 191.67 ms per token, 5.22 tokens per second)\n", | ||
"llama_print_timings: total time = 17950.28 ms\n", | ||
"Llama.generate: prefix-match hit\n", | ||
"\n", | ||
"llama_print_timings: load time = 3570.02 ms\n", | ||
"llama_print_timings: sample time = 21.12 ms / 18 runs ( 1.17 ms per token, 852.43 tokens per second)\n", | ||
"llama_print_timings: prompt eval time = 2387.83 ms / 18 tokens ( 132.66 ms per token, 7.54 tokens per second)\n", | ||
"llama_print_timings: eval time = 3226.78 ms / 17 runs ( 189.81 ms per token, 5.27 tokens per second)\n", | ||
"llama_print_timings: total time = 5687.06 ms\n", | ||
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n", | ||
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||
"Try using .loc[row_indexer,col_indexer] = value instead\n", | ||
"\n", | ||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||
" benchmark_df[\"response_options\"] = self.response_options\n", | ||
"/tmp/ipykernel_177845/406750325.py:7: UserWarning: Column 'prompt' does not exist. Using column 'messages' instead.\n", | ||
" openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()\n", | ||
"/home/hashem/Desktop/workspace/prompttools/prompttools/benchmarks/benchmark.py:82: SettingWithCopyWarning: \n", | ||
"A value is trying to be set on a copy of a slice from a DataFrame.\n", | ||
"Try using .loc[row_indexer,col_indexer] = value instead\n", | ||
"\n", | ||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", | ||
" benchmark_df[\"response_options\"] = self.response_options\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Time taken: 0:01:51.087746\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"start = datetime.datetime.now()\n", | ||
"print(f\"Start time: {start}\")\n", | ||
"\n", | ||
"google_flan_t5_xxl_results = benchmarking_google_flan_t5_xxl.multiple_choice_benchmark()\n", | ||
"vicuna7b_results = benchmarking_vicuna7b.multiple_choice_benchmark()\n", | ||
"vicuna13b_results = benchmarking_vicuna13b.multiple_choice_benchmark()\n", | ||
"openai_chat_results = benchmarking_openai_chat.multiple_choice_benchmark()\n", | ||
"\n", | ||
"print(\"Time taken: \", datetime.datetime.now() - start)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "c63e5335", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" google_flan_t5_xxl vicuna7b vicuna13b openai_chat\n", | ||
"0 0.333333 0.333333 0.333333 0.666667 \n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(pd.DataFrame(\n", | ||
" data=[[google_flan_t5_xxl_results, vicuna7b_results, vicuna13b_results, openai_chat_results]],\n", | ||
" columns=[\"google_flan_t5_xxl\", \"vicuna7b\", \"vicuna13b\", \"openai_chat\"]))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
}, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "eec05f12730ef3ef66f433616fcd3cfdacd3dcf1f1c49c706eaa0465be8f325b" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Copyright (c) Hegel AI, Inc. | ||
# All rights reserved. | ||
# | ||
# This source code's license can be found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
from .benchmark import Benchmark | ||
|
||
|
||
__all__ = [ | ||
"Benchmark", | ||
] |
Oops, something went wrong.