main.py
# run with: python main.py <experiment_title> <dspy_method> <dspy_optimizer> [options]
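# example invocations (argument values are illustrative; dspy_method is "cot" or "cot_with_thought",
# dspy_optimizer is "bootstrap_fewshot_with_random_search" or "signature_optimizer"):
#   python main.py my-cot-experiment cot bootstrap_fewshot_with_random_search
#   python main.py my-sig-experiment cot_with_thought signature_optimizer --student gpt-3.5-turbo --train_size 50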
import pickle
import time
import argparse
from typing import Optional
from opentom_evaluator import OpenToMEvaluatorDspy
import dspy
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, SignatureOptimizer
from dspy.evaluate.evaluate import Evaluate
from cot import CoTSimplifiedBaleen
from cot_with_thought import CoTWithThoughtSimplifiedBaleen
from get_data import default_factory, load_dataset
from collections import defaultdict
from dotenv import load_dotenv
import neptune
import numpy as np
load_dotenv()
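# load_dotenv() reads credentials from a local .env file; at minimum, NEPTUNE_API_TOKEN (for neptune.init_run
# below) and OPENAI_API_KEY (for dspy.OpenAI) are expected to be available there or in the environment
# (an assumption about this repo's setup).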
# initialize neptune
run = neptune.init_run(
project="dspy-opentom/dspy-evaluation",
capture_hardware_metrics=False,
capture_stderr=True,
capture_stdout=True,
capture_traceback=True,
)
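
# OpenToM question types evaluated below: "fo"/"so" denote first-order vs. second-order theory-of-mind
# questions, and "coarse"/"fine" denote the granularity of the location questions (a brief gloss of the
# OpenToM benchmark's naming, added here for readability).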
EVAL_QUESTION_TYPES = [
    "attitude",
    "multihop-fo",
    "multihop-so",
    "location-fo-coarse",
    "location-fo-fine",
    "location-so-coarse",
    "location-so-fine",
]


def dump_state(data, filename):
    with open(filename, "wb") as file:
        pickle.dump(data, file)
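
# dump_state is used at the end of main() to persist the compiled DSPy modules to disk before uploading them
# to Neptune; the resulting pickle can later be reloaded with pickle.load to reuse the compiled programs.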


def main(dspy_method, dspy_optimizer, download_dataset, question_types, teacher_lm, train_size):
    # load dataset
    if download_dataset:
        load_dataset()

    # read in the datasets pickle object
    with open("datasets.pkl", "rb") as file:
        datasets = pickle.load(file)

    if dspy_method == "cot":
        module_type = CoTSimplifiedBaleen
    elif dspy_method == "cot_with_thought":
        module_type = CoTWithThoughtSimplifiedBaleen
    else:
        raise Exception(f"Dspy method '{dspy_method}' is not valid")

    modules = {}
    # compile a module for each question type
    for question_type in question_types:
        print(f"TYPE: {question_type}")
        evaluator = OpenToMEvaluatorDspy(model_name="(training set) compiled baleen")
        if dspy_optimizer == "bootstrap_fewshot_with_random_search":
            optimizer = BootstrapFewShotWithRandomSearch(
                metric=evaluator.dspy_metric,
                num_candidate_programs=25,
                num_threads=1,
                teacher_settings=dict(lm=teacher_lm),
            )
            compiled_baleen = optimizer.compile(module_type(), trainset=datasets[question_type]["train"][:train_size])
        elif dspy_optimizer == "signature_optimizer":
            optimizer = SignatureOptimizer(
                metric=evaluator.dspy_metric,
                breadth=10,
                depth=3,
                init_temperature=1.4,
                verbose=True,
                track_stats=True,
                prompt_model=teacher_lm,
            )
            eval_kwargs = dict(num_threads=1, display_progress=True, display_table=0)
            compiled_baleen = optimizer.compile(
                module_type(),
                devset=datasets[question_type]["train"][:train_size],
                eval_kwargs=eval_kwargs,
            )
        else:
            raise Exception(f"Invalid dspy optimizer type: {dspy_optimizer}")
        modules[question_type] = compiled_baleen
        time.sleep(10)  # brief pause between question types (presumably to ease API rate limits)
    uncompiled_baleen = CoTSimplifiedBaleen()  # regular cot is always the uncompiled baseline

    print("Beginning Evaluation")
    for question_type in question_types:
        compiled_baleen = modules[question_type]

        # Evaluation procedure: compute the macro-averaged F1 score on 5 disjoint batches of 50 test questions,
        # then report the mean and standard deviation across batches.
        batch_size = 50
        num_batches = 5
        assert len(datasets[question_type]["test"]) >= batch_size * num_batches
        test = datasets[question_type]["test"][: batch_size * num_batches]
        test_sets = [test[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]

        uncompiled_f1_scores = []
        compiled_f1_scores = []
        for test in test_sets:
            # Set up the `evaluate_on_opentom` function for this batch.
            evaluate_on_opentom = Evaluate(devset=test, num_threads=1, display_progress=True, display_table=0)

            uncompiled_baleen_evaluator = OpenToMEvaluatorDspy(model_name="uncompiled_baleen")
            evaluate_on_opentom(uncompiled_baleen, metric=uncompiled_baleen_evaluator.dspy_metric, display=True)
            uncompiled_f1_scores.append(uncompiled_baleen_evaluator.f1_score()[question_type]["macro_averaged"])

            compiled_baleen_evaluator = OpenToMEvaluatorDspy(model_name="compiled_baleen")
            evaluate_on_opentom(compiled_baleen, metric=compiled_baleen_evaluator.dspy_metric, display=True)
            compiled_f1_scores.append(compiled_baleen_evaluator.f1_score()[question_type]["macro_averaged"])

        # overall f1 scores
        uncompiled_mean_f1 = np.mean(uncompiled_f1_scores)
        uncompiled_std_f1 = np.std(uncompiled_f1_scores)
        compiled_mean_f1 = np.mean(compiled_f1_scores)
        compiled_std_f1 = np.std(compiled_f1_scores)

        run[f"evaluation/{question_type}/uncompiled/mean_macro_averaged_f1"] = uncompiled_mean_f1
        run[f"evaluation/{question_type}/uncompiled/std_macro_averaged_f1"] = uncompiled_std_f1
        run[f"evaluation/{question_type}/compiled/mean_macro_averaged_f1"] = compiled_mean_f1
        run[f"evaluation/{question_type}/compiled/std_macro_averaged_f1"] = compiled_std_f1

        print(
            f"Mean Macro Averaged F1 Scores (± std dev.) - {question_type} - Aggregated from {num_batches} batches of {batch_size} questions"
        )
        print(f"uncompiled: {uncompiled_mean_f1:.3f} ± {uncompiled_std_f1:.3f}")
        print(f"compiled: {compiled_mean_f1:.3f} ± {compiled_std_f1:.3f}")

    dump_state(modules, "cot_modules.pkl")
    run["cot_modules"].upload("cot_modules.pkl")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run DSPY method.")

    # dspy arguments
    parser.add_argument("experiment_title", type=str, help="Title of new experiment")
    parser.add_argument("dspy_method", type=str, help="The DSPY method to run")
    parser.add_argument("dspy_optimizer", type=str, help="The DSPY optimizer to use")
    parser.add_argument("--student", default="gpt-3.5-turbo", type=str, help="The LLM to optimize prompts for")
    parser.add_argument("--teacher", default=None, type=str, help="Teacher LLM for optimizing prompts. Defaults to the student LLM")
    parser.add_argument("--train_size", default=50, type=int, help="Number of training examples to use for optimization")
    # caveat: argparse's type=bool converts any non-empty string (including "False") to True
    parser.add_argument("--download_dataset", default=True, type=bool, help="Download dataset")
    parser.add_argument("--question_types", default=EVAL_QUESTION_TYPES, nargs="*", help="Question types. Defaults to all")
    args = parser.parse_args()

    # set up LLMs
    student_lm = dspy.OpenAI(model=args.student, max_tokens=1000)
    args.teacher = args.student if args.teacher is None else args.teacher
    teacher_lm = dspy.OpenAI(model=args.teacher, max_tokens=1000)
    dspy.settings.configure(lm=student_lm)

    # validate question types
    question_types = args.question_types
    assert all(question_type in EVAL_QUESTION_TYPES for question_type in question_types)
    args.question_types = ", ".join(question_types)  # turn list into string for neptune logging

    # log run parameters
    run["parameters"] = args
    run["sys/name"] = args.experiment_title

    main(args.dspy_method, args.dspy_optimizer, args.download_dataset, question_types, teacher_lm, args.train_size)