From 6b0d4a362249b3f92c99f6428e9ce166868d643c Mon Sep 17 00:00:00 2001 From: "qiurc (Meta Employee)" Date: Thu, 19 Dec 2024 13:23:16 -0800 Subject: [PATCH] Support garbage collection after pt2 compilation (#143364) Summary: **Context:** We recently observed a ~10% training GPU memory regression caused by inefficient memory recycling at PyTorch 2 (PT2) compilation time. This diff mitigates the memory regression caused by PT2 compilation. Detailed debugging notes: https://docs.google.com/document/d/1EPopAyYyXwTnkyVaUJ5Xa_Uw9iWv3zimK7FkagKsKIY/edit?tab=t.0#bookmark=id.e5b26tcdfl5g In this diff, we support garbage collection after PT2 compilation. **Rollout / rollback plan:** To ensure system reliability, we designed two layers of control for this change's rollout: - Add a jk to control the global rollout / rollback of this functionality. The jk is on by default. - Add an env var to control an individual job's rollout. The env var is on by default. X-link: https://github.com/pytorch/pytorch/pull/143364 Approved by: https://github.com/ezyang Reviewed By: ezyang Differential Revision: D67328568 Pulled By: huydhn fbshipit-source-id: d0c856846bef3bdd3b060df90cf5888d57245ff8 --- userbenchmark/dynamo/dynamobench/_dynamo/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/userbenchmark/dynamo/dynamobench/_dynamo/utils.py b/userbenchmark/dynamo/dynamobench/_dynamo/utils.py index 677acf0b6..b1035da05 100644 --- a/userbenchmark/dynamo/dynamobench/_dynamo/utils.py +++ b/userbenchmark/dynamo/dynamobench/_dynamo/utils.py @@ -917,6 +917,7 @@ class CompilationMetrics: feature_usage: Optional[dict[str, bool]] = None compile_time_autotune_time_us: Optional[int] = None is_runtime: Optional[bool] = False + gc_time_us: Optional[int] = None DEFAULT_COMPILATION_METRICS_LIMIT = 64