Run multiple inferences with different prompt lengths
nv-hwoo committed Sep 28, 2023
1 parent 85e2d83 commit db14cda
Showing 1 changed file with 64 additions and 22 deletions.
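
In brief, this change replaces the single hard-coded prompt with a loop over several randomly generated prompt lengths and adds a --model flag (default: vllm). Assuming perf_analyzer is installed and a Triton Inference Server is already serving the target model (neither is shown in this commit), the updated example would be invoked as, for example, python profile.py --model vllm.
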
src/c++/perf_analyzer/docs/examples/profile.py (86 changes: 64 additions & 22 deletions)

@@ -24,44 +24,86 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import argparse
 import json
+import random
 import subprocess
 from pathlib import Path

-if __name__ == "__main__":
-    # Clean up
-    export_file = Path("profile_export.json")
-    export_file.unlink(missing_ok=True)
+RANDOM_WORDS = [
+    "system",
+    "plug",
+    "gentle",
+    "efficient",
+    "library",
+    "tested",
+    "careful",
+    "sneeze",
+    "excuse",
+    "zoo",
+    "rock",
+    "delight",
+    "hammer",
+    "unit",
+    "happen",
+    "multiply",
+    "texture",
+    "tired",
+    "knot",
+    "yawn",
+]

with open("prompts.json", "w") as f:
json.dump(
{
"data": [
{
"PROMPT": ["Hello, my name is "],
"STREAM": [True],
}
],
},
f,
)

# Run Perf Analyzer
def profile(args):

[Check notice: Code scanning / CodeQL] Explicit returns mixed with implicit (fall-through) returns: mixing implicit and explicit returns may indicate an error, as implicit returns always return None. In this change, profile() returns avg_latency_s only when perf_analyzer exits successfully and otherwise falls through to None (see the sketch after the diff).
     command = (
-        "perf_analyzer -m vllm -i grpc --async --streaming "
+        f"perf_analyzer -m {args.model} -i grpc --async --streaming "
         "--input-data=prompts.json "
         "--profile-export-file=profile_export.json "
         "--measurement-mode=count_windows "
         "--measurement-request-count=10 "
         "--stability-percentage=999"
     )
     ret = subprocess.run(args=[command], shell=True)

     if ret.returncode == 0:
-        # example json demonstrating format:
-        # https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/docs/examples/decoupled_output_file.json
+        # Example json demonstrating format:
+        # see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
         with open("profile_export.json") as f:
             requests = json.load(f)["experiments"][0]["requests"]
             latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
             avg_latency_s = sum(latencies) / len(latencies) / 1_000_000_000
             print(f"Average first-token latency: {avg_latency_s} sec")
+            return avg_latency_s


+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--model",
+        type=str,
+        default="vllm",
+        help="The name of the model to profile.",
+    )
+    args = parser.parse_args()

+    prompt_lengths = [10, 100, 500, 800, 1000]
+    input_data = {"data": [{"STREAM": [True]}]}
+    results = []

+    for prompt_length in prompt_lengths:
+        # Generate random prompt
+        prompt = random.choices(RANDOM_WORDS, k=prompt_length)
+        input_data["data"][0]["PROMPT"] = [" ".join(prompt)]
+        with open("prompts.json", "w") as f:
+            json.dump(input_data, f)

+        # Clean up
+        export_file = Path("profile_export.json")
+        export_file.unlink(missing_ok=True)

+        results.append(profile(args))

+    print("[ Summary: First-Token Latency ]")
+    for prompt_length, latency in zip(prompt_lengths, results):
+        print(
+            f"- Prompt Length: {prompt_length} | Average first-token latency: {latency}"
+        )
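
As flagged by the CodeQL notice in the diff, profile() has one explicit return (avg_latency_s on success) and otherwise falls through to an implicit None, so a failed perf_analyzer run would quietly append None to results. Below is a minimal sketch of one way to make the return paths explicit; it is not part of this commit, and raising on failure is an illustrative choice rather than the repository's behavior.

import json
import subprocess


def profile(args):
    # Same perf_analyzer invocation as in the commit; args.model comes from argparse.
    command = (
        f"perf_analyzer -m {args.model} -i grpc --async --streaming "
        "--input-data=prompts.json "
        "--profile-export-file=profile_export.json "
        "--measurement-mode=count_windows "
        "--measurement-request-count=10 "
        "--stability-percentage=999"
    )
    ret = subprocess.run(args=[command], shell=True)
    if ret.returncode != 0:
        # Explicit failure path instead of an implicit `return None`.
        raise RuntimeError(f"perf_analyzer exited with code {ret.returncode}")

    with open("profile_export.json") as f:
        requests = json.load(f)["experiments"][0]["requests"]

    # First-token latency per request: first response timestamp minus the request
    # timestamp, converted from nanoseconds to seconds.
    latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
    avg_latency_s = sum(latencies) / len(latencies) / 1_000_000_000
    print(f"Average first-token latency: {avg_latency_s} sec")
    return avg_latency_s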
