-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbatching_experiment.py
56 lines (48 loc) · 1.7 KB
/
batching_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from transformers import LlamaForCausalLM
import torch
import time
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
# Pin PyTorch to one CPU thread so intra-op parallelism doesn't add noise
# to the timing measurements (relevant only on the CPU fallback path).
torch.set_num_threads(1)
def run_batching_experiment(model, batch_size, length, num_iters=100):
    """Measure forward-pass (prefill) latency for one batch-size/length cell.

    Runs ``num_iters`` timed forward passes over freshly sampled random
    token ids and returns the mean and standard deviation of the
    wall-clock time per pass.

    Args:
        model: causal LM callable as ``model(input_ids=...)``
            (e.g. a Hugging Face ``LlamaForCausalLM``).
        batch_size: number of sequences per forward pass.
        length: tokens per sequence.
        num_iters: number of timed repetitions (default 100, matching the
            original hard-coded loop count).

    Returns:
        Tuple ``(mean_seconds, std_seconds)`` of floats.
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda = device.type == "cuda"
    with torch.no_grad():
        # Warm-up pass so one-time costs (kernel compilation, allocator
        # growth) don't contaminate the first timing sample.
        warmup = torch.randint(0, 32000, (batch_size, length)).to(device)
        _ = model(input_ids=warmup)
        scenario_times = []
        for _ in range(num_iters):
            # 32000 is the LLaMA tokenizer vocab size — assumed to match
            # the model's embedding table; TODO confirm for other models.
            data = torch.randint(0, 32000, (batch_size, length)).to(device)
            # CUDA launches are asynchronous: without synchronize() the
            # timer would measure kernel *launch* time, not execution.
            if use_cuda:
                torch.cuda.synchronize()
            start_time = time.time()
            _ = model(input_ids=data)
            if use_cuda:
                torch.cuda.synchronize()
            scenario_times.append(time.time() - start_time)
    return float(np.mean(scenario_times)), float(np.std(scenario_times))
if __name__ == "__main__":
model_path = "princeton-nlp/Sheared-LLaMA-1.3B"
model = LlamaForCausalLM.from_pretrained(
model_path,
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map="auto",
)
LENGTHS = [50, 100, 200, 400]
BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64]
data = {}
for length in tqdm(LENGTHS):
data[length] = [run_batching_experiment(model, bs, length) for bs in BATCH_SIZES]
print(f"Length: {length}", data[length])
for length in LENGTHS:
d = data[length]
plt.plot(BATCH_SIZES, [d[0] for d in data[length]], label=f"D={length}")
plt.fill_between(
BATCH_SIZES, [d[0] - d[1] for d in data[length]], [d[0] + d[1] for d in data[length]], alpha=0.2
)
plt.legend()
plt.xlabel("Batch size")
plt.xscale("log", base=2)
plt.ylabel("TTFT (s)")
plt.savefig("batch_size_figure.png")