microsoft · PatriceVignola · May 7, 2024 · May 4, 2024 · May 4, 2024
diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py
@@ -85,6 +85,9 @@ def main(args):
             generator.generate_next_token()
         if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0)))
 
+        # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
+        del generator
+
     tokenize_times = []
     prompt_times = []
     token_gen_times = []
@@ -141,6 +144,9 @@ def main(args):
         wall_clock_times.append(wall_clock_end_time - wall_clock_start_time)
         if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0)))
 
+        # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
+        del generator
+
     # Calculate tokenization metrics
     avg_tokenization_latency_s = sum(tokenize_times) / len(tokenize_times)
     avg_tokenization_latency_ms = avg_tokenization_latency_s * 1000

diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py
@@ -72,6 +72,9 @@ def main(args):
         print()
         print()
 
+        # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
+        del generator
+
         if args.timings:
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp

diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py
@@ -69,6 +69,9 @@ def main(args):
         print()
         print()
 
+        # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
+        del generator
+
         if args.timings:
             prompt_time = first_token_timestamp - started_timestamp
             run_time = time.time() - first_token_timestamp