diff --git a/byte_infer_perf/llm_perf/requirements.txt b/byte_infer_perf/llm_perf/requirements.txt index 7180d844..4f522610 100644 --- a/byte_infer_perf/llm_perf/requirements.txt +++ b/byte_infer_perf/llm_perf/requirements.txt @@ -5,7 +5,9 @@ isort sentencepiece pandas google-api-python-client -transformers==4.33.2 +transformers==4.40.0 tqdm matplotlib -backoff \ No newline at end of file +backoff +psutil +accelerate \ No newline at end of file diff --git a/byte_infer_perf/llm_perf/server/endpoint.py b/byte_infer_perf/llm_perf/server/endpoint.py index c33d078a..091dd76d 100644 --- a/byte_infer_perf/llm_perf/server/endpoint.py +++ b/byte_infer_perf/llm_perf/server/endpoint.py @@ -140,6 +140,8 @@ async def streaming_inference( prompt_tokens = len(req.input_ids) completion_tokens = 0 + tokens_buffer = [] + async for gen_res in self.scheduler.generate(req): result = gen_res["result"] if result is not None: @@ -157,7 +159,14 @@ async def streaming_inference( } if result is not None: - text = self.tokenizer.decode([result.token_id], skip_special_tokens=True, clean_up_tokenization_spaces=True) + tokens_buffer.append(result.token_id) + + text = self.tokenizer.decode(tokens_buffer, skip_special_tokens=True, clean_up_tokenization_spaces=True) + if text == " �" or text == "�": + text = "" + else: + tokens_buffer = [] + infer_outputs["choice"].update( { "message": text,