Run multiple inferences with different prompt lengths
nv-hwoo committed Sep 28, 2023
1 parent 85e2d83 commit db14cda
Showing 1 changed file with 64 additions and 22 deletions.
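
In brief, this change replaces the single hard-coded prompt with a loop over several randomly generated prompt lengths and adds a --model flag (default: vllm). Assuming perf_analyzer is installed and a Triton Inference Server is already serving the target model (neither is shown in this commit), the updated example would be invoked as, for example, python profile.py --model vllm.
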
src/c++/perf_analyzer/docs/examples/profile.py (86 changes: 64 additions & 22 deletions)

@@ -24,44 +24,86 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import argparse
 import json
+import random
 import subprocess
 from pathlib import Path

-if __name__ == "__main__":
-    # Clean up
-    export_file = Path("profile_export.json")
-    export_file.unlink(missing_ok=True)
+RANDOM_WORDS = [
+    "system",
+    "plug",
+    "gentle",
+    "efficient",
+    "library",
+    "tested",
+    "careful",
+    "sneeze",
+    "excuse",
+    "zoo",
+    "rock",
+    "delight",
+    "hammer",
+    "unit",
+    "happen",
+    "multiply",
+    "texture",
+    "tired",
+    "knot",
+    "yawn",
+]

with open("prompts.json", "w") as f:
json.dump(
{
"data": [
{
"PROMPT": ["Hello, my name is "],
"STREAM": [True],
}
],
},
f,
)

# Run Perf Analyzer
def profile(args):

[Check notice: Code scanning / CodeQL] Explicit returns mixed with implicit (fall-through) returns: mixing implicit and explicit returns may indicate an error, as implicit returns always return None. In this change, profile() returns avg_latency_s only when perf_analyzer exits successfully and otherwise falls through to None (see the sketch after the diff).
     command = (
-        "perf_analyzer -m vllm -i grpc --async --streaming "
+        f"perf_analyzer -m {args.model} -i grpc --async --streaming "
         "--input-data=prompts.json "
         "--profile-export-file=profile_export.json "
         "--measurement-mode=count_windows "
         "--measurement-request-count=10 "
         "--stability-percentage=999"
     )
     ret = subprocess.run(args=[command], shell=True)

     if ret.returncode == 0:
-        # example json demonstrating format:
-        # https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/perf_analyzer/docs/examples/decoupled_output_file.json
+        # Example json demonstrating format:
+        # see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
         with open("profile_export.json") as f:
             requests = json.load(f)["experiments"][0]["requests"]
             latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
             avg_latency_s = sum(latencies) / len(latencies) / 1_000_000_000
             print(f"Average first-token latency: {avg_latency_s} sec")
+            return avg_latency_s


+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-m",
+        "--model",
+        type=str,
+        default="vllm",
+        help="The name of the model to profile.",
+    )
+    args = parser.parse_args()

+    prompt_lengths = [10, 100, 500, 800, 1000]
+    input_data = {"data": [{"STREAM": [True]}]}
+    results = []

+    for prompt_length in prompt_lengths:
+        # Generate random prompt
+        prompt = random.choices(RANDOM_WORDS, k=prompt_length)
+        input_data["data"][0]["PROMPT"] = [" ".join(prompt)]
+        with open("prompts.json", "w") as f:
+            json.dump(input_data, f)

+        # Clean up
+        export_file = Path("profile_export.json")
+        export_file.unlink(missing_ok=True)

+        results.append(profile(args))

+    print("[ Summary: First-Token Latency ]")
+    for prompt_length, latency in zip(prompt_lengths, results):
+        print(
+            f"- Prompt Length: {prompt_length} | Average first-token latency: {latency}"
+        )
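
As flagged by the CodeQL notice in the diff, profile() has one explicit return (avg_latency_s on success) and otherwise falls through to an implicit None, so a failed perf_analyzer run would quietly append None to results. Below is a minimal sketch of one way to make the return paths explicit; it is not part of this commit, and raising on failure is an illustrative choice rather than the repository's behavior.

import json
import subprocess


def profile(args):
    # Same perf_analyzer invocation as in the commit; args.model comes from argparse.
    command = (
        f"perf_analyzer -m {args.model} -i grpc --async --streaming "
        "--input-data=prompts.json "
        "--profile-export-file=profile_export.json "
        "--measurement-mode=count_windows "
        "--measurement-request-count=10 "
        "--stability-percentage=999"
    )
    ret = subprocess.run(args=[command], shell=True)
    if ret.returncode != 0:
        # Explicit failure path instead of an implicit `return None`.
        raise RuntimeError(f"perf_analyzer exited with code {ret.returncode}")

    with open("profile_export.json") as f:
        requests = json.load(f)["experiments"][0]["requests"]

    # First-token latency per request: first response timestamp minus the request
    # timestamp, converted from nanoseconds to seconds.
    latencies = [r["response_timestamps"][0] - r["timestamp"] for r in requests]
    avg_latency_s = sum(latencies) / len(latencies) / 1_000_000_000
    print(f"Average first-token latency: {avg_latency_s} sec")
    return avg_latency_s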
