From e2439e5ef9244d6ec77184afb57d59b3137d08cf Mon Sep 17 00:00:00 2001
From: "Yu, Zhentao"
Date: Mon, 3 Jun 2024 06:10:06 +0000
Subject: [PATCH] fix ret when ignore_prompt

Signed-off-by: Yu, Zhentao
---
 docs/continuous_batching.md | 50 ++++++++++++++++++++++++++++++++++++++++---
 neural_speed/__init__.py    |  2 +-
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/docs/continuous_batching.md b/docs/continuous_batching.md
index 1e59e6549..b09dab6bc 100644
--- a/docs/continuous_batching.md
+++ b/docs/continuous_batching.md
@@ -16,7 +16,7 @@ We only support multi-batch inference in concatenating & splitting input sequenc
 The code example is like:
 
 ```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer
 from neural_speed import Model
 
 model_name = "meta-llama/Llama-2-7b-hf"
@@ -32,7 +32,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, pa
 # if the tokenizer has no pad_token, you can specify it.
 tokenizer.pad_token = tokenizer.eos_token
 pad_token_id = tokenizer.pad_token_id
-inputs = tokenizer(ps, padding=True, return_tensors='pt').input_ids
+inputs = tokenizer(prompts, padding=True, return_tensors='pt').input_ids
 
 model = Model()
 model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")
@@ -46,6 +46,52 @@ for a in ans:
 ```
 > Note: Not every model supports multi-batching inference and most of them are under construction, please refer to [Supported Models](#supported-models).
 
+You can use the code below to get the `tokens/second` metric if you care about the throughput of batched inference.
+```python
+import time
+
+from transformers import AutoTokenizer
+from neural_speed import Model
+
+model_name = "meta-llama/Llama-2-7b-hf"
+prompts = [
+        "Tell me an interesting fact about llamas.",
+        "What is the best way to cook a steak?",
+        "Are you familiar with the Special Theory of Relativity and can you explain it to me?",
+        "Recommend some interesting books to read.",
+        "What is the best way to learn a new language?",
+    ]
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")
+# if the tokenizer has no pad_token, you can specify it.
+tokenizer.pad_token = tokenizer.eos_token
+pad_token_id = tokenizer.pad_token_id
+inputs = tokenizer(prompts, padding=True, return_tensors='pt').input_ids
+bs = inputs.shape[0]  # batch size, reused as max_request_num below
+
+model = Model()
+model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")
+# greedy search example; top_k_top_p sampling and beam_search are also supported
+# do not forget to pass pad_token_id
+# warmup
+outputs = model.generate(inputs,
+                         max_new_tokens=4,
+                         do_sample=False,
+                         pad_token=pad_token_id,
+                         ignore_prompt=True,
+                         max_request_num=bs)
+t0 = time.time()
+outputs = model.generate(inputs,
+                         max_new_tokens=128,
+                         do_sample=False,
+                         pad_token=pad_token_id,
+                         ignore_prompt=True,
+                         max_request_num=bs)
+duration = time.time() - t0
+total_tokens = sum([len(a) for a in outputs])
+print("throughput is {} tokens/second.".format(total_tokens / duration))
+```
+
 ## Server
 
 We supply a corresponding [script](../scripts/python_api_example_for_model_server.py) for server usage. You can modify the `max_request_num` for setting the maximum bearable requests.
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index 1130c13a4..9dfcdb653 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -361,7 +361,7 @@ def generate(self,
             self.model.reinit()
             self.generate_round = 0
 
-        ret = [[]]
+        ret = [[] for _ in range(input_ids.shape[0])]  # one output list per batch row
         if self.generate_round == 0 and not ignore_prompt:
             ret = input_ids.tolist()
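For context, the sketch below illustrates the intent of the one-line change: with `ignore_prompt=True` the prompt tokens are never copied into `ret`, so `ret` must already hold one empty list per batch row for generated tokens to be appended into. This is a simplified stand-in, not the actual `Model.generate` loop in neural_speed; the `collect_generated` helper and the token values are made up for demonstration.

```python
import torch

def collect_generated(input_ids: torch.Tensor, new_tokens_per_row, ignore_prompt: bool):
    """Toy stand-in for the output bookkeeping inside Model.generate()."""
    batch_size = input_ids.shape[0]
    # Before the fix, ret = [[]] provides a single slot regardless of batch size,
    # so rows 1..N-1 have nowhere to go when the prompt is ignored.
    ret = [[] for _ in range(batch_size)]   # after the fix: one list per request
    if not ignore_prompt:
        ret = input_ids.tolist()            # seed each row with its prompt tokens
    for row, tokens in enumerate(new_tokens_per_row):
        ret[row].extend(tokens)             # append newly generated tokens per row
    return ret

# Hypothetical batch of two prompts, three generated tokens each.
prompt_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(collect_generated(prompt_ids, [[7, 8, 9], [10, 11, 12]], ignore_prompt=True))
# -> [[7, 8, 9], [10, 11, 12]]  (one output list per request, prompts excluded)
```

In this toy version the old `ret = [[]]` initialization would fail outright on the second row; in the real generation loop the symptom is presumably a malformed return value when `ignore_prompt=True`, which is what the patch subject refers to.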