Commit 2a75c0b: Fix per token latency (#223)

IlyasMoutawwakil authored Jul 3, 2024
1 parent 19eeac5

Showing 1 changed file with 12 additions and 10 deletions.

optimum_benchmark/trackers/latency.py (22 changes: 12 additions & 10 deletions)
@@ -262,7 +262,7 @@ def __init__(self, device: str, backend: str):
             LOGGER.info("\t+ Tracking latency using CPU performance counter")
 
         self.start_time: Optional[float] = None
-        self.next_is_prefill_end_decode_start: Optional[bool] = None
+        self.prefilled: Optional[bool] = None
 
         self.per_token_events: List[Union[float, torch.cuda.Event]] = []
         self.prefill_start_events: List[Union[float, torch.cuda.Event]] = []
@@ -272,7 +272,7 @@ def __init__(self, device: str, backend: str):
 
     def reset(self):
         self.start_time = None
-        self.next_is_prefill_end_decode_start = None
+        self.prefilled = None
 
         self.per_token_events = []
         self.prefill_start_events = []
@@ -291,11 +291,13 @@ def track(self):
         else:
             self.prefill_start_events.append(time.perf_counter())
 
-        self.next_is_prefill_end_decode_start = True  # this is used to record the end of prefill and start of decode
+        self.prefilled = False
 
-        yield  # this is where generate is called, and for each decoded token, we record an event
+        # this is where generate is called,
+        # and for each decoded token, we record an event
+        yield
 
-        self.next_is_prefill_end_decode_start = None
+        self.prefilled = None
 
         if self.is_asynchronous:
             self.decode_end_events.append(torch.cuda.Event(enable_timing=True))
@@ -308,7 +310,7 @@ def track(self):
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
         assert (
-            self.next_is_prefill_end_decode_start is not None
+            self.prefilled is not None
         ), "PerTokenLatencyLogitsProcessor should only be called inside of track() context"
 
         if self.is_asynchronous:
@@ -317,12 +319,12 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
         else:
             event = time.perf_counter()
 
-        if self.next_is_prefill_end_decode_start:
+        self.per_token_events.append(event)
+
+        if not self.prefilled:
             self.prefill_end_events.append(event)
             self.decode_start_events.append(event)
-            self.next_is_prefill_end_decode_start = False
-        else:
-            self.per_token_events.append(event)
+            self.prefilled = True
 
         return scores
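
For readers skimming the diff: before this change, the event recorded on the first decoded token went only to prefill_end_events and decode_start_events (the else branch), so per_token_events was missing its first timestamp and the interval between the first and second decoded tokens could not be measured. The fix appends every event to per_token_events unconditionally and uses the prefilled flag only to mark the prefill-to-decode boundary once.

Below is a minimal sketch of how the fixed processor could be driven and read out. It is not the library's documented API: only the class name, the constructor signature, the event lists, and the is_asynchronous flag are confirmed by the diff above. It assumes track() is a contextmanager (the yield in the diff suggests this), that device="cpu" selects the synchronous time.perf_counter() path, and it uses gpt2 purely as a placeholder model.

    # Hedged sketch under the assumptions stated above.
    from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList

    # Assumed import path, based on the file touched by this commit.
    from optimum_benchmark.trackers.latency import PerTokenLatencyLogitsProcessor

    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Constructor signature taken from the __init__ shown in the diff;
    # device="cpu" is assumed to select the synchronous perf_counter path.
    processor = PerTokenLatencyLogitsProcessor(device="cpu", backend="pytorch")

    inputs = tokenizer("Hello", return_tensors="pt")

    # generate() runs inside track(), as the yield comment in the diff describes;
    # __call__ then fires once per decoded token and records one event each time.
    with processor.track():
        model.generate(
            **inputs,
            max_new_tokens=8,
            do_sample=False,
            logits_processor=LogitsProcessorList([processor]),
        )

    # Since the fix, per_token_events holds one timestamp per decoded token,
    # including the first, so consecutive differences give per-token latencies.
    events = processor.per_token_events
    per_token_latencies = [t1 - t0 for t0, t1 in zip(events, events[1:])]
    print(per_token_latencies)

With the old code, events would have held one timestamp fewer than the number of decoded tokens, so the first interval computed above would actually have spanned the second and third tokens. In the asynchronous path, the diff shows torch.cuda.Event(enable_timing=True) objects being recorded instead of floats; there the equivalent readout would use the standard torch API (again a sketch):

    torch.cuda.synchronize()
    per_token_latencies_ms = [e0.elapsed_time(e1) for e0, e1 in zip(events, events[1:])]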
