From db14cda780508d1cd2a7a7ad3cfd37c7b1872457 Mon Sep 17 00:00:00 2001
From: Hyunjae Woo
Date: Thu, 28 Sep 2023 16:10:31 -0700
Subject: [PATCH] Run multiple inferences with different prompt lengths

---
 .../perf_analyzer/docs/examples/profile.py | 86 ++++++++++++++-----
 1 file changed, 64 insertions(+), 22 deletions(-)

diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py
index b4b339900..a5d270b9e 100644
--- a/src/c++/perf_analyzer/docs/examples/profile.py
+++ b/src/c++/perf_analyzer/docs/examples/profile.py
@@ -24,31 +24,39 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import argparse
 import json
+import random
 import subprocess
 from pathlib import Path
 
-if __name__ == "__main__":
-    # Clean up
-    export_file = Path("profile_export.json")
-    export_file.unlink(missing_ok=True)
+RANDOM_WORDS = [
+    "system",
+    "plug",
+    "gentle",
+    "efficient",
+    "library",
+    "tested",
+    "careful",
+    "sneeze",
+    "excuse",
+    "zoo",
+    "rock",
+    "delight",
+    "hammer",
+    "unit",
+    "happen",
+    "multiply",
+    "texture",
+    "tired",
+    "knot",
+    "yawn",
+]
 
-    with open("prompts.json", "w") as f:
-        json.dump(
-            {
-                "data": [
-                    {
-                        "PROMPT": ["Hello, my name is "],
-                        "STREAM": [True],
-                    }
-                ],
-            },
-            f,
-        )
 
-    # Run Perf Analyzer
+def profile(args):
     command = (
-        "perf_analyzer -m vllm -i grpc --async --streaming "
+        f"perf_analyzer -m {args.model} -i grpc --async --streaming "
         "--input-data=prompts.json "
         "--profile-export-file=profile_export.json "
         "--measurement-mode=count_windows "
@@ -56,12 +64,46 @@
         "--measurement-request-count=10 "
         "--stability-percentage=999"
     )
     ret = subprocess.run(args=[command], shell=True)
-    if ret.returncode == 0:
-        # example json demonstrating format:
-        # https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/docs/examples/decoupled_output_file.json
+    # Example json demonstrating format:
+    # see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
     with open("profile_export.json") as f:
         requests = json.load(f)["experiments"][0]["requests"]
         latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
         avg_latency_s = sum(latencies) / len(latencies) / 1_000_000_000
-        print(f"Average first-token latency: {avg_latency_s} sec")
+    return avg_latency_s
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--model",
+        type=str,
+        default="vllm",
+        help="The name of the model to profile.",
+    )
+    args = parser.parse_args()
+
+    prompt_lengths = [10, 100, 500, 800, 1000]
+    input_data = {"data": [{"STREAM": [True]}]}
+    results = []
+
+    for prompt_length in prompt_lengths:
+        # Generate random prompt
+        prompt = random.choices(RANDOM_WORDS, k=prompt_length)
+        input_data["data"][0]["PROMPT"] = [" ".join(prompt)]
+        with open("prompts.json", "w") as f:
+            json.dump(input_data, f)
+
+        # Clean up
+        export_file = Path("profile_export.json")
+        export_file.unlink(missing_ok=True)
+
+        results.append(profile(args))
+
+    print("[ Summary: First-Token Latency ]")
+    for prompt_length, latency in zip(prompt_lengths, results):
+        print(
+            f"- Prompt Length: {prompt_length} | Average first-token latency: {latency}"
+        )
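
Usage sketch for the patched script (assumes perf_analyzer is on PATH and a Triton Inference Server instance is already serving the target model; "vllm" is simply the script's default model name):

    python profile.py --model vllm

For each prompt length in [10, 100, 500, 800, 1000], the script rewrites prompts.json as {"data": [{"STREAM": [true], "PROMPT": ["<N space-separated random words>"]}]}, deletes any stale profile_export.json, runs perf_analyzer with the flags shown in profile(), and records the average first-token latency in seconds computed from each request's first response timestamp. The final loop prints one summary line per prompt length.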