From 283aa5de2dcd370c07bc35bb1089ce0f5c12d480 Mon Sep 17 00:00:00 2001
From: jiangzishan
Date: Thu, 10 Oct 2024 20:03:50 +0800
Subject: [PATCH] add tensorrt-llm benchmark script.

---
 .../llm_perf/benchmark/tensorrt-llm/README.md |  50 ++++
 .../benchmark/tensorrt-llm/bench_engine.py    | 256 ++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 byte_infer_perf/llm_perf/benchmark/tensorrt-llm/README.md
 create mode 100644 byte_infer_perf/llm_perf/benchmark/tensorrt-llm/bench_engine.py

diff --git a/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/README.md b/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/README.md
new file mode 100644
index 00000000..6d887114
--- /dev/null
+++ b/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/README.md
@@ -0,0 +1,50 @@
+# TensorRT-LLM Benchmark
+
+
+## installation
+Refer to [TensorRT-LLM](https://nvidia.github.io/TensorRT-LLM/installation/linux.html) for details.
+
+The following environments have been tested:
+- Docker image: nvcr.io/nvidia/cuda:12.5.1-devel-ubuntu20.04
+- TensorRT-LLM: 0.13.0
+- TensorRT: 10.4.0.26
+
+## build engine and test
+mixtral-8x22b, tp_size=8, moe_tp_size=8, dtype=float16
+```
+cd TensorRT-LLM/examples/mixtral
+
+# convert model
+python3 ../llama/convert_checkpoint.py \
+    --model_dir ./mixtral-8x22b \
+    --output_dir ./tllm_checkpoint_mixtral_8gpu \
+    --dtype float16 \
+    --tp_size 8
+
+# build engine
+trtllm-build \
+    --checkpoint_dir ./tllm_checkpoint_mixtral_8gpu \
+    --output_dir ./trt_engines/mixtral/tp8 \
+    --max_batch_size 256 \
+    --max_input_len 17408 \
+    --max_seq_len 17408 \
+    --max_num_tokens 17408
+
+# run engine with given prompt
+mpirun --allow-run-as-root -n 8 python3 ../run.py \
+    --engine_dir ./trt_engines/mixtral/tp8/ \
+    --tokenizer_dir ./mixtral-8x22b \
+    --max_output_len 100 \
+    --input_text "7 years ago, I was 6 times older than my son. My son is 12 years old now. How old am I now?"
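+
+# (optional) sanity check: trtllm-build writes a config.json into the engine
+# directory; bench_engine.py below reads its build limits (max_batch_size,
+# max_num_tokens, ...) from this file
+python3 -m json.tool ./trt_engines/mixtral/tp8/config.json | head -n 40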
+
+# benchmark engine
+python3 bench_engine.py \
+    --engine_dir ./trt_engines/mixtral/tp8/ \
+    --model_dir ./mixtral-8x22b \
+    --batch_size_list 1,2,4,8,16,32,40,48,56,64,72,80,88,96,104,112,120,128 \
+    --seq_len_list 1024,2048,4096,6144,8192
+```
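+
+## how decode latency is derived
+`bench_engine.py` measures prefill latency directly with `gptSessionBenchmark` and derives per-token decode latency from two `gptManagerBenchmark` runs whose datasets differ only in requested output length (1 vs. 101 tokens). A minimal sketch of the arithmetic, with illustrative numbers (variable names mirror the script):
+```
+context_latency = 812.4    # ms, output length 1: essentially pure prefill
+decode_latency = 4570.9    # ms, output length 101: prefill + 100 decode steps
+
+# differencing the two runs cancels the shared prefill cost
+per_token_latency = (decode_latency - context_latency) / (101 - 1)
+print(f"{per_token_latency:.3f} ms/token")  # 37.585 ms/token
+```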

diff --git a/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/bench_engine.py b/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/bench_engine.py
new file mode 100644
index 00000000..dbc970b0
--- /dev/null
+++ b/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/bench_engine.py
@@ -0,0 +1,256 @@
+import os
+import sys
+import pathlib
+import argparse
+import logging
+import json
+import subprocess
+
+CUR_DIR = pathlib.Path.cwd()
+FILE_DIR = pathlib.Path(__file__).parent.absolute()
+
+logger = logging.getLogger("bench_trtllm")
+
+
+def setup_logger(loglevel: str):
+    fmt = logging.Formatter(
+        fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s]: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    handler = logging.StreamHandler(stream=sys.stdout)
+    handler.setFormatter(fmt)
+    logger.addHandler(handler)
+    logger.setLevel(loglevel.upper())
+    logger.propagate = False
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    # tensorrt-llm project path
+    parser.add_argument("--trtllm_dir", type=str)
+
+    # model engine
+    parser.add_argument("--engine_dir", type=str, required=True)
+    parser.add_argument("--model_dir", type=str, required=True)
+
+    # perf config
+    parser.add_argument("--batch_size_list", type=str, help="comma-separated batch sizes, e.g. \"1,2,4,8,16,32\"")
+    parser.add_argument("--seq_len_list", type=str, help="comma-separated sequence lengths, e.g. \"1024,2048,4096,8192\"")
+
+    # workspace
+    parser.add_argument("--workspace", type=str, default=str(CUR_DIR.joinpath("workspace")))
+
+    # logging
+    parser.add_argument("--loglevel", type=str, default="INFO")
+
+    args = parser.parse_args()
+
+    setup_logger(args.loglevel)
+
+    # check trtllm
+    if args.trtllm_dir is None and os.getenv("TRTLLM_PATH") is None:
+        logger.error("trtllm_dir is not set, please pass --trtllm_dir or set TRTLLM_PATH in the environment")
+        sys.exit(-1)
+    trtllm_dir = pathlib.Path(args.trtllm_dir if args.trtllm_dir is not None else os.getenv("TRTLLM_PATH")).absolute()
+    benchmark_build_dir = trtllm_dir.joinpath("cpp", "build", "benchmarks")
+    session_benchmark = benchmark_build_dir.joinpath("gptSessionBenchmark")
+    manager_benchmark = benchmark_build_dir.joinpath("gptManagerBenchmark")
+    if not benchmark_build_dir.exists() or not session_benchmark.exists() or not manager_benchmark.exists():
+        logger.error(f"benchmark_build_dir: {benchmark_build_dir} does not exist, please build the benchmarks first: cd cpp/build/benchmarks && make")
+        sys.exit(-1)
+
+    benchmark_dir = trtllm_dir.joinpath("benchmarks", "cpp")
+    prepare_dataset_script = benchmark_dir.joinpath("prepare_dataset.py")
+    if not benchmark_dir.exists() or not prepare_dataset_script.exists():
+        logger.error(f"{prepare_dataset_script} does not exist")
+        sys.exit(-1)
+
+    # check engine
+    engine_dir = pathlib.Path(args.engine_dir).absolute()
+    if not engine_dir.exists():
+        logger.error(f"engine_dir: {engine_dir} does not exist")
+        sys.exit(-1)
+
+    # check model
+    model_dir = pathlib.Path(args.model_dir).absolute()
+    if not model_dir.exists():
+        logger.error(f"model_dir: {model_dir} does not exist")
+        sys.exit(-1)
+
+    # check batch_size_list
+    if args.batch_size_list is None:
+        logger.error("--batch_size_list is required")
+        sys.exit(-1)
+    batch_size_list = [int(batch_size) for batch_size in args.batch_size_list.split(",")]
+
+    # check seq_len_list
+    if args.seq_len_list is None:
+        logger.error("--seq_len_list is required")
+        sys.exit(-1)
+    seq_len_list = [int(seq_len) for seq_len in args.seq_len_list.split(",")]
+
+    # workspace
+    workspace = pathlib.Path(args.workspace).absolute()
+    if not workspace.exists():
+        workspace.mkdir(parents=True)
+
+    return (
+        workspace,
+        session_benchmark, manager_benchmark, prepare_dataset_script,
+        engine_dir, model_dir,
+        batch_size_list, seq_len_list
+    )
+
+
+def context_perf(session_benchmark, engine_dir, seq_len_list):
+    print("")
+    engine_config = engine_dir.joinpath("config.json")
+    config_data = json.loads(engine_config.read_text())
+
+    max_batch_size = config_data["build_config"]["max_batch_size"]
+    max_input_len = config_data["build_config"]["max_input_len"]
+    max_seq_len = config_data["build_config"]["max_seq_len"]
+    max_num_tokens = config_data["build_config"]["max_num_tokens"]
+
+    # parallelism and device info come from the build config baked into the engine
+    tp_size = config_data["build_config"]["auto_parallel_config"]["gpus_per_node"]
+    device_name = config_data["build_config"]["auto_parallel_config"]["cluster_key"]
+    device_info = config_data["build_config"]["auto_parallel_config"]["cluster_info"]
+
+    for seq_len in seq_len_list:
+        if seq_len > max_num_tokens:
+            logger.warning(f"seq_len: {seq_len} > max_num_tokens: {max_num_tokens}, skip")
+            continue
+
+        run_cmd = f"mpirun --allow-run-as-root -n {tp_size} {session_benchmark}"
+        run_cmd += f" --engine_dir {engine_dir}"
+        run_cmd += f" --batch_size 1"
+        run_cmd += f" --warm_up 2 --num_runs 20"
+        run_cmd += f" --input_output_len \"{seq_len},1\""
+
+        results = subprocess.run(run_cmd, shell=True, capture_output=True, text=True)
+        if results.returncode != 0:
+            logger.error(f"run cmd: {run_cmd} failed, returncode: {results.returncode}, stderr: {results.stderr}")
+            sys.exit(-1)
+        for line in results.stdout.splitlines():
+            if line.startswith("[BENCHMARK]"):
+                # positional format: [BENCHMARK] batch_size <n> input_length <n> output_length <n> latency(ms) <x> ...
+                try:
+                    data_items = line.split()
+                    batch_size = int(data_items[2])
+                    input_len = int(data_items[4])
+                    output_len = int(data_items[6])
+                    latency = float(data_items[8])
+                except Exception as e:
+                    logger.error(f"parse line: {line} failed, error: {e}")
+                    sys.exit(-1)
+                logger.info(f"prefill, batch_size: {batch_size}, seq_len: {input_len}, latency: {latency} ms")
+
+
+def decode_perf(workspace, manager_benchmark, prepare_dataset_script, engine_dir, model_path, batch_size_list, seq_len_list):
+    print("")
+    engine_config = engine_dir.joinpath("config.json")
+    config_data = json.loads(engine_config.read_text())
+
+    max_batch_size = config_data["build_config"]["max_batch_size"]
+    max_input_len = config_data["build_config"]["max_input_len"]
+    max_seq_len = config_data["build_config"]["max_seq_len"]
+    max_num_tokens = config_data["build_config"]["max_num_tokens"]
+
+    tp_size = config_data["build_config"]["auto_parallel_config"]["gpus_per_node"]
+    device_name = config_data["build_config"]["auto_parallel_config"]["cluster_key"]
+    device_info = config_data["build_config"]["auto_parallel_config"]["cluster_info"]
+
+    for seq_len in seq_len_list:
+        if seq_len > max_num_tokens:
+            logger.warning(f"seq_len: {seq_len} > max_num_tokens: {max_num_tokens}, skip")
+            continue
+
+        seq_workspace = workspace.joinpath(f"seq_{seq_len}")
+        seq_workspace.mkdir(parents=True, exist_ok=True)
+
+        context_generate_tokens = 1
+        decode_generate_tokens = 101
+
+        context_dataset = seq_workspace.joinpath(f"context_{seq_len}_{context_generate_tokens}.json")
+        decode_dataset = seq_workspace.joinpath(f"decode_{seq_len}_{decode_generate_tokens}.json")
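+
+        # The two datasets below are identical except for the requested output
+        # length (1 token vs. 101 tokens). Differencing the end-to-end latencies
+        # of the two runs cancels the shared prefill cost, leaving 100 pure
+        # decode steps from which per-token decode latency is derived.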
+        prepare_dataset_cmd = f"python3 {prepare_dataset_script}"
+        prepare_dataset_cmd += f" --output {context_dataset}"
+        prepare_dataset_cmd += f" --tokenizer {model_path}"
+        prepare_dataset_cmd += f" token-norm-dist --num-requests {max_batch_size}"
+        prepare_dataset_cmd += f" --input-mean {seq_len} --input-stdev 0"
+        prepare_dataset_cmd += f" --output-mean {context_generate_tokens} --output-stdev 0"
+        results = subprocess.run(prepare_dataset_cmd, shell=True, capture_output=True, text=True)
+        if results.returncode != 0:
+            logger.error(f"prepare dataset failed: {prepare_dataset_cmd}, stderr: {results.stderr}")
+            sys.exit(-1)
+
+        prepare_dataset_cmd = f"python3 {prepare_dataset_script}"
+        prepare_dataset_cmd += f" --output {decode_dataset}"
+        prepare_dataset_cmd += f" --tokenizer {model_path}"
+        prepare_dataset_cmd += f" token-norm-dist --num-requests {max_batch_size}"
+        prepare_dataset_cmd += f" --input-mean {seq_len} --input-stdev 0"
+        prepare_dataset_cmd += f" --output-mean {decode_generate_tokens} --output-stdev 0"
+        results = subprocess.run(prepare_dataset_cmd, shell=True, capture_output=True, text=True)
+        if results.returncode != 0:
+            logger.error(f"prepare dataset failed: {prepare_dataset_cmd}, stderr: {results.stderr}")
+            sys.exit(-1)
+
+        for batch_size in batch_size_list:
+            if batch_size > max_batch_size:
+                logger.warning(f"batch_size: {batch_size} > max_batch_size: {max_batch_size}, skip")
+                continue
+
+            context_csv = seq_workspace.joinpath(f"context_batch{batch_size}.csv")
+            decode_csv = seq_workspace.joinpath(f"decode_batch{batch_size}.csv")
+
+            # context: output length 1
+            run_cmd = f"mpirun --allow-run-as-root -n {tp_size} {manager_benchmark}"
+            run_cmd += f" --engine_dir {engine_dir}"
+            run_cmd += f" --type IFB"
+            run_cmd += f" --max_num_tokens {min(int(seq_len * 1.5), int(max_num_tokens))}"
+            run_cmd += f" --max_num_samples {batch_size}"
+            run_cmd += f" --static_emulated_batch_size {batch_size}"
+            run_cmd += f" --enable_kv_cache_reuse false"
+            run_cmd += f" --dataset {context_dataset}"
+            run_cmd += f" --output_csv {context_csv}"
+            results = subprocess.run(run_cmd, shell=True, capture_output=True, text=True)
+            if results.returncode != 0:
+                logger.error(f"run cmd: {run_cmd} failed, returncode: {results.returncode}, stderr: {results.stderr}")
+                continue
+
+            # decode: output length 101
+            run_cmd = f"mpirun --allow-run-as-root -n {tp_size} {manager_benchmark}"
+            run_cmd += f" --engine_dir {engine_dir}"
+            run_cmd += f" --type IFB"
+            run_cmd += f" --max_num_tokens {min(int(seq_len * 1.5), int(max_num_tokens))}"
+            run_cmd += f" --max_num_samples {batch_size}"
+            run_cmd += f" --static_emulated_batch_size {batch_size}"
+            run_cmd += f" --enable_kv_cache_reuse false"
+            run_cmd += f" --dataset {decode_dataset}"
+            run_cmd += f" --output_csv {decode_csv}"
+            results = subprocess.run(run_cmd, shell=True, capture_output=True, text=True)
+            if results.returncode != 0:
+                logger.error(f"run cmd: {run_cmd} failed, returncode: {results.returncode}, stderr: {results.stderr}")
+                continue
+
+            if context_csv.exists() and decode_csv.exists():
+                try:
+                    # the third csv column holds the measured end-to-end latency in ms
+                    context_latency = float(context_csv.read_text().splitlines()[1].split(",")[2])
+                    decode_latency = float(decode_csv.read_text().splitlines()[1].split(",")[2])
+                except Exception as e:
+                    logger.error(f"parse context_csv: {context_csv} and decode_csv: {decode_csv} failed, error: {e}")
+                    continue
+
+                per_token_latency = round((decode_latency - context_latency) / (decode_generate_tokens - context_generate_tokens), 3)
+                logger.info(f"decode, batch_size: {batch_size}, seq_len: {seq_len}, latency: {per_token_latency} ms")
+
+
+if __name__ == "__main__":
+    workspace, session_benchmark, manager_benchmark, prepare_dataset_script, engine_dir, model_dir, batch_size_list, seq_len_list = parse_args()
+
+    logger.info(f"session_benchmark: {session_benchmark}")
+    logger.info(f"manager_benchmark: {manager_benchmark}")
+    logger.info(f"engine_dir: {engine_dir}")
+    logger.info(f"batch_size_list: {batch_size_list}")
+    logger.info(f"seq_len_list: {seq_len_list}")
+
+    context_perf(session_benchmark, engine_dir, seq_len_list)
+    decode_perf(workspace, manager_benchmark, prepare_dataset_script, engine_dir, model_dir, batch_size_list, seq_len_list)
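+
+    # context_perf logs prefill latency per sequence length at batch size 1;
+    # decode_perf logs amortized per-token decode latency for each
+    # (batch_size, seq_len) pair via the two-dataset differencing above.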