From bd0e8d5c0a6f5b7e3852f5dd4922f34a9866359f Mon Sep 17 00:00:00 2001 From: Yuqing Jiang Date: Mon, 23 Oct 2023 11:44:38 -0700 Subject: [PATCH] Fix profile options argument: allow disabling selected options Summary: After enabling detailed profiling by default in D49785828, --profile-options becomes unusable because the logic of --no-profile-detailed is all-or-nothing. Fix this by changing --profile-options to --disable-profile-options. Most of the changes are auto formatting. (I marked the logic changes with diff comments below) Reviewed By: davidberard98 Differential Revision: D50524522 fbshipit-source-id: a74624c6f9b0d1619804161f7c132687baf82fd4 --- run.py | 231 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 175 insertions(+), 56 deletions(-) diff --git a/run.py b/run.py index 2ff7db533a..26012eb8ce 100644 --- a/run.py +++ b/run.py @@ -57,7 +57,7 @@ def run_one_step_with_cudastreams(func, streamcount): for i in range(1, streamcount + 1, 1): # create additional streams and prime with load - while len(streamlist) < i : + while len(streamlist) < i: s = torch.cuda.Stream() streamlist.append(s) @@ -80,50 +80,110 @@ def run_one_step_with_cudastreams(func, streamcount): torch.cuda.synchronize() print(f"Cuda StreamCount:{len(streamlist)}") - print('{:<20} {:>20}'.format("GPU Time:", "%.3f milliseconds" % start_event.elapsed_time(end_event)), sep='') + print( + "{:<20} {:>20}".format( + "GPU Time:", "%.3f milliseconds" % start_event.elapsed_time(end_event) + ), + sep="", + ) -def printResultSummaryTime(result_summary, model, metrics_needed=[], flops_model_analyzer=None, model_flops=None, cpu_peak_mem=None, mem_device_id=None, gpu_peak_mem=None): - assert (model is not None), "model can not be None." +def printResultSummaryTime( + result_summary, + model, + metrics_needed=[], + flops_model_analyzer=None, + model_flops=None, + cpu_peak_mem=None, + mem_device_id=None, + gpu_peak_mem=None, +): + assert model is not None, "model can not be None." 
if args.device == "cuda": gpu_time = np.median(list(map(lambda x: x[0], result_summary))) cpu_walltime = np.median(list(map(lambda x: x[1], result_summary))) - print('{:<20} {:>20}'.format("GPU Time per batch:", "%.3f milliseconds" % - (gpu_time / model.num_batch), sep='')) - print('{:<20} {:>20}'.format("CPU Wall Time per batch:", "%.3f milliseconds" % - (cpu_walltime / model.num_batch), sep='')) + print( + "{:<20} {:>20}".format( + "GPU Time per batch:", + "%.3f milliseconds" % (gpu_time / model.num_batch), + sep="", + ) + ) + print( + "{:<20} {:>20}".format( + "CPU Wall Time per batch:", + "%.3f milliseconds" % (cpu_walltime / model.num_batch), + sep="", + ) + ) else: cpu_walltime = np.median(list(map(lambda x: x[0], result_summary))) - print('{:<20} {:>20}'.format("CPU Wall Time per batch:", "%.3f milliseconds" % (cpu_walltime / model.num_batch), sep='')) + print( + "{:<20} {:>20}".format( + "CPU Wall Time per batch:", + "%.3f milliseconds" % (cpu_walltime / model.num_batch), + sep="", + ) + ) # if model_flops is not None, output the TFLOPs per sec - if 'flops' in metrics_needed: - if flops_model_analyzer.metrics_backend_mapping['flops'] == 'dcgm': + if "flops" in metrics_needed: + if flops_model_analyzer.metrics_backend_mapping["flops"] == "dcgm": tflops_device_id, tflops = flops_model_analyzer.calculate_flops() else: flops = model.get_flops() tflops = flops / (cpu_walltime / 1.0e3) / 1.0e12 - print('{:<20} {:>20}'.format("GPU FLOPS:", "%.4f TFLOPs per second" % tflops, sep='')) - if 'ttfb' in metrics_needed: - print('{:<20} {:>20}'.format("Time to first batch:", "%.4f ms" % model.ttfb, sep='')) + print( + "{:<20} {:>20}".format( + "GPU FLOPS:", "%.4f TFLOPs per second" % tflops, sep="" + ) + ) + if "ttfb" in metrics_needed: + print( + "{:<20} {:>20}".format( + "Time to first batch:", "%.4f ms" % model.ttfb, sep="" + ) + ) if model_flops is not None: tflops = model_flops / (cpu_walltime / 1.0e3) / 1.0e12 - print('{:<20} {:>20}'.format("Model Flops:", "%.4f 
TFLOPs per second" % tflops, sep='')) + print( + "{:<20} {:>20}".format( + "Model Flops:", "%.4f TFLOPs per second" % tflops, sep="" + ) + ) if gpu_peak_mem is not None: - print('{:<20} {:>20}'.format("GPU %d Peak Memory:" % mem_device_id, "%.4f GB" % gpu_peak_mem, sep='')) + print( + "{:<20} {:>20}".format( + "GPU %d Peak Memory:" % mem_device_id, "%.4f GB" % gpu_peak_mem, sep="" + ) + ) if cpu_peak_mem is not None: - print('{:<20} {:>20}'.format("CPU Peak Memory:", "%.4f GB" % cpu_peak_mem, sep='')) + print( + "{:<20} {:>20}".format("CPU Peak Memory:", "%.4f GB" % cpu_peak_mem, sep="") + ) -def run_one_step(func, model, nwarmup=WARMUP_ROUNDS, num_iter=10, export_metrics_file=None, stress=0, metrics_needed=[], metrics_gpu_backend=None): +def run_one_step( + func, + model, + nwarmup=WARMUP_ROUNDS, + num_iter=10, + export_metrics_file=None, + stress=0, + metrics_needed=[], + metrics_gpu_backend=None, +): # Warm-up `nwarmup` rounds for _i in range(nwarmup): func() result_summary = [] flops_model_analyzer = None - if 'flops' in metrics_needed: + if "flops" in metrics_needed: from components.model_analyzer.TorchBenchAnalyzer import ModelAnalyzer - flops_model_analyzer = ModelAnalyzer(export_metrics_file, ['flops'], metrics_gpu_backend) + + flops_model_analyzer = ModelAnalyzer( + export_metrics_file, ["flops"], metrics_gpu_backend + ) flops_model_analyzer.start_monitor() if stress: @@ -135,7 +195,7 @@ def run_one_step(func, model, nwarmup=WARMUP_ROUNDS, num_iter=10, export_metrics _i = 0 last_it = 0 first_print_out = True - while (not stress and _i < num_iter) or (stress and cur_time < target_time) : + while (not stress and _i < num_iter) or (stress and cur_time < target_time): if args.device == "cuda": torch.cuda.synchronize() start_event = torch.cuda.Event(enable_timing=True) @@ -149,7 +209,9 @@ def run_one_step(func, model, nwarmup=WARMUP_ROUNDS, num_iter=10, export_metrics end_event.record() torch.cuda.synchronize() t1 = time.time_ns() - 
result_summary.append((start_event.elapsed_time(end_event), (t1 - t0) / 1_000_000)) + result_summary.append( + (start_event.elapsed_time(end_event), (t1 - t0) / 1_000_000) + ) elif args.device == "mps": t0 = time.time_ns() func() @@ -167,11 +229,19 @@ def run_one_step(func, model, nwarmup=WARMUP_ROUNDS, num_iter=10, export_metrics # print out the status every 10s. if (cur_time - last_time) >= 10 * 1e9: if first_print_out: - print('|{:^20}|{:^20}|{:^20}|'.format("Iterations", "Time/Iteration(ms)", "Rest Time(s)")) + print( + "|{:^20}|{:^20}|{:^20}|".format( + "Iterations", "Time/Iteration(ms)", "Rest Time(s)" + ) + ) first_print_out = False est = (target_time - cur_time) / 1e9 time_per_it = (cur_time - last_time) / (_i - last_it) / 1e6 - print('|{:^20}|{:^20}|{:^20}|'.format("%d" % _i, "%.2f" % time_per_it , "%d" % int(est))) + print( + "|{:^20}|{:^20}|{:^20}|".format( + "%d" % _i, "%.2f" % time_per_it, "%d" % int(est) + ) + ) last_time = cur_time last_it = _i _i += 1 @@ -183,38 +253,64 @@ def run_one_step(func, model, nwarmup=WARMUP_ROUNDS, num_iter=10, export_metrics gpu_peak_mem = None mem_device_id = None model_flops = None - if 'cpu_peak_mem' in metrics_needed or 'gpu_peak_mem' in metrics_needed: - cpu_peak_mem, mem_device_id, gpu_peak_mem = get_peak_memory(func, model.device, export_metrics_file=export_metrics_file, metrics_needed=metrics_needed, metrics_gpu_backend=metrics_gpu_backend) - if 'model_flops' in metrics_needed: + if "cpu_peak_mem" in metrics_needed or "gpu_peak_mem" in metrics_needed: + cpu_peak_mem, mem_device_id, gpu_peak_mem = get_peak_memory( + func, + model.device, + export_metrics_file=export_metrics_file, + metrics_needed=metrics_needed, + metrics_gpu_backend=metrics_gpu_backend, + ) + if "model_flops" in metrics_needed: model_flops = get_model_flops(model) - printResultSummaryTime(result_summary, model, metrics_needed, flops_model_analyzer, model_flops, cpu_peak_mem, mem_device_id, gpu_peak_mem) + printResultSummaryTime( + result_summary, 
+ model, + metrics_needed, + flops_model_analyzer, + model_flops, + cpu_peak_mem, + mem_device_id, + gpu_peak_mem, + ) def profile_one_step(func, model, nwarmup=WARMUP_ROUNDS): activity_groups = [] result_summary = [] - device_to_activity = {'cuda': profiler.ProfilerActivity.CUDA, 'cpu': profiler.ProfilerActivity.CPU} + device_to_activity = { + "cuda": profiler.ProfilerActivity.CUDA, + "cpu": profiler.ProfilerActivity.CPU, + } if args.profile_devices: activity_groups = [ - device_to_activity[device] for device in args.profile_devices if (device in device_to_activity) + device_to_activity[device] + for device in args.profile_devices + if (device in device_to_activity) ] else: - if args.device == 'cuda': + if args.device == "cuda": activity_groups = [ profiler.ProfilerActivity.CUDA, profiler.ProfilerActivity.CPU, ] - elif args.device == 'cpu': + elif args.device == "cpu": activity_groups = [profiler.ProfilerActivity.CPU] profile_opts = {} + for opt in SUPPORT_PROFILE_LIST: - profile_opts[opt] = True if args.profile_options is not None and opt in args.profile_options else False + profile_opts[opt] = False if args.no_profile_detailed else True + # options can be overridden by disable-profile-options + if args.disable_profile_options is not None and opt in args.disable_profile_options: + profile_opts[opt] = False if args.profile_eg: - from datetime import datetime import os + from datetime import datetime + from torch.profiler import ExecutionTraceObserver + start_time = datetime.now() timestamp = int(datetime.timestamp(start_time)) eg_file = f"{args.model}_{timestamp}_eg.json" @@ -227,12 +323,17 @@ def profile_one_step(func, model, nwarmup=WARMUP_ROUNDS): with profiler.profile( schedule=profiler.schedule(wait=0, warmup=nwarmup, active=1, repeat=1), activities=activity_groups, - record_shapes=args.no_profile_detailed if args.no_profile_detailed else profile_opts["record_shapes"], - profile_memory=args.no_profile_detailed if args.no_profile_detailed else
profile_opts["profile_memory"], - with_stack=args.no_profile_detailed if args.no_profile_detailed else profile_opts["with_stack"], - with_flops=args.no_profile_detailed if args.no_profile_detailed else profile_opts["with_flops"], - with_modules=args.no_profile_detailed if args.no_profile_detailed else profile_opts["with_modules"], - on_trace_ready= partial(trace_handler, f"torchbench_{args.model}") if (not hasattr(torch.version, "git_version") and args.profile_export_chrome_trace) else profiler.tensorboard_trace_handler(args.profile_folder), + record_shapes=profile_opts["record_shapes"], + profile_memory=profile_opts["profile_memory"], + with_stack=profile_opts["with_stack"], + with_flops=profile_opts["with_flops"], + with_modules=profile_opts["with_modules"], + on_trace_ready=partial(trace_handler, f"torchbench_{args.model}") + if ( + not hasattr(torch.version, "git_version") + and args.profile_export_chrome_trace + ) + else profiler.tensorboard_trace_handler(args.profile_folder), ) as prof: if args.device == "cuda": start_event = torch.cuda.Event(enable_timing=True) @@ -245,10 +346,12 @@ def profile_one_step(func, model, nwarmup=WARMUP_ROUNDS): end_event.record() t1 = time.time_ns() if i >= nwarmup: - result_summary.append((start_event.elapsed_time(end_event), (t1 - t0) / 1_000_000)) + result_summary.append( + (start_event.elapsed_time(end_event), (t1 - t0) / 1_000_000) + ) prof.step() else: - for i in range(nwarmup + 1): + for i in range(nwarmup + 1): t0 = time.time_ns() func() t1 = time.time_ns() @@ -259,17 +362,24 @@ def profile_one_step(func, model, nwarmup=WARMUP_ROUNDS): eg.stop() eg.unregister_callback() print(f"Save Exeution Trace to : {args.profile_eg_folder}/{eg_file}") - print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=30)) + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cpu_time_total", row_limit=30 + ) + ) print(f"Saved TensorBoard Profiler traces to {args.profile_folder}.") 
printResultSummaryTime(result_summary, model=m) + def _validate_devices(devices: str): devices_list = devices.split(",") valid_devices = SUPPORT_DEVICE_LIST for d in devices_list: if d not in valid_devices: - raise ValueError(f'Invalid device {d} passed into --profile-devices. Expected devices: {valid_devices}.') + raise ValueError( + f"Invalid device {d} passed into --profile-devices. Expected devices: {valid_devices}." + ) return devices_list @@ -277,7 +387,9 @@ def _validate_profile_options(profile_options: str): profile_options_list = profile_options.split(",") for opt in profile_options_list: if opt not in SUPPORT_PROFILE_LIST: - raise ValueError(f'Invalid profile option {opt} passed into --profile-options. Expected options: {SUPPORT_PROFILE_LIST}.') + raise ValueError( + f"Invalid profile option {opt} passed into --profile-options. Expected options: {SUPPORT_PROFILE_LIST}." + ) return profile_options_list @@ -305,9 +417,9 @@ def _validate_profile_options(profile_options: str): "--profile", action="store_true", help="Run the profiler around the function" ) parser.add_argument( - "--profile-options", + "--disable-profile-options", type=_validate_profile_options, - help=f"Select which profile options to enable. Valid options: {SUPPORT_PROFILE_LIST}.", + help=f"Select which profile options to disable. Valid options: {SUPPORT_PROFILE_LIST}.", ) parser.add_argument("--amp", action="store_true", help="enable torch.autocast()") parser.add_argument( @@ -317,8 +429,9 @@ def _validate_profile_options(profile_options: str): ) parser.add_argument( "--no-profile-detailed", - action="store_false", - help=f"Only profile GPU kernels, excluding {SUPPORT_PROFILE_LIST}. Overrides --profile-options.", + action="store_true", + help=f"Only profile GPU kernels, excluding {SUPPORT_PROFILE_LIST}. 
" + "To only disable some profile options, use --disable-profile-options instead.", ) parser.add_argument( "--profile-export-chrome-trace", @@ -359,14 +472,23 @@ def _validate_profile_options(profile_options: str): "--metrics", type=str, default="cpu_peak_mem,gpu_peak_mem,ttfb", - help="Specify metrics [cpu_peak_mem,gpu_peak_mem,ttfb,flops,model_flops]to be collected. You can also set `none` to disable all metrics. The metrics are separated by comma such as cpu_peak_mem,gpu_peak_mem.", + help="Specify metrics [cpu_peak_mem,gpu_peak_mem,ttfb,flops,model_flops]to be collected. " + "You can also set `none` to disable all metrics. The metrics are separated by comma such as cpu_peak_mem,gpu_peak_mem.", ) parser.add_argument( "--metrics-gpu-backend", choices=["dcgm", "default"], default="default", - help="""Specify the backend [dcgm, default] to collect metrics. \nIn default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally, - \n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). \n - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.\n - you can specify flops by --metrics flops, and it is collected by fvcore.\nIn dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().\n - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""", + help=""" + Specify the backend [dcgm, default] to collect metrics. + In default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. + Optionally, - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). 
+ - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library. + - you can specify flops by --metrics flops, and it is collected by fvcore. + In dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. + Optionally, + - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). + - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""", ) parser.add_argument( "--channels-last", action="store_true", help="enable torch.channels_last()" @@ -464,10 +586,7 @@ def _validate_profile_options(profile_options: str): else: export_metrics_file = None if args.profile: - profile_one_step( - test, - model=m - ) + profile_one_step(test, model=m) elif args.cudastreams: run_one_step_with_cudastreams(test, 10) else: