Measure GIL contention in benchmarks (#937)

Add argument to measure and report GIL contention in benchmarks. This may provide some useful insight when optimizing asyncio-related changes. Sample results observed are below. <details><summary>Small messages (1 B)</summary> ``` $ python -m ucp.benchmarks.send_recv --backend ucp-core --n-bytes 1 --n-iter 100_000 --no-detailed-report --report-gil-contention ... GIL contention | 0.688429594039917 $ python -m ucp.benchmarks.send_recv --backend ucp-async --n-bytes 1 --n-iter 100_000 --no-detailed-report --report-gil-contention ... GIL contention | 0.4772491455078125 $ UCXPY_NON_BLOCKING_MODE=1 python -m ucp.benchmarks.send_recv --backend ucp-async --n-bytes 1 --n-iter 100_000 --no-detailed-report --report-gil-contention ... GIL contention | 0.8763511180877686 ``` </details> <details><summary>Medium messages (1 MiB)</summary> ``` $ python -m ucp.benchmarks.send_recv --backend ucp-core --n-bytes 1MiB --n-iter 100 --no-detailed-report --report-gil-contention ... GIL contention | 0.1721574366092682 $ python -m ucp.benchmarks.send_recv --backend ucp-async --n-bytes 1MiB --n-iter 100 --no-detailed-report --report-gil-contention ... GIL contention | 0.25215959548950195 $ UCXPY_NON_BLOCKING_MODE=1 python -m ucp.benchmarks.send_recv --backend ucp-async --n-bytes 1MiB --n-iter 100 --no-detailed-report --report-gil-contention ... GIL contention | 0.2693021595478058 ``` </details> <details><summary>Large messages (1 GiB)</summary> ``` $ python -m ucp.benchmarks.send_recv --backend ucp-core --n-bytes 1GiB --n-iter 10 --no-detailed-report --report-gil-contention ... GIL contention | 0.5908554792404175 $ python -m ucp.benchmarks.send_recv --backend ucp-async --n-bytes 1GiB --n-iter 10 --no-detailed-report --report-gil-contention ... GIL contention | 0.6251853108406067 $ UCXPY_NON_BLOCKING_MODE=1 python -m ucp.benchmarks.send_recv --backend ucp-async --n-bytes 1GiB --n-iter 10 --no-detailed-report --report-gil-contention ...
GIL contention | 0.832139790058136 ``` </details> Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: #937
rapidsai · Oct 13, 2023 · 9c17700 · 9c17700
1 parent 02918e8
commit 9c17700
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 0 deletions.
diff --git a/ucp/benchmarks/backends/tornado.py b/ucp/benchmarks/backends/tornado.py
@@ -86,6 +86,14 @@ async def run(self) -> bool:
             recv_msg = np.zeros(self.args.n_bytes, dtype="u1")
             assert recv_msg.nbytes == self.args.n_bytes
 
+        if self.args.report_gil_contention:
+            from gilknocker import KnockKnock
+
+            # Use the smallest polling interval possible; otherwise contention will
+            # always be zero for small messages and inconsistent for large messages.
+            knocker = KnockKnock(polling_interval_micros=1)
+            knocker.start()
+
         times = []
         for i in range(self.args.n_iter + self.args.n_warmup_iter):
             start = monotonic()
@@ -99,4 +107,10 @@ async def run(self) -> bool:
             stop = monotonic()
             if i >= self.args.n_warmup_iter:
                 times.append(stop - start)
+
+        if self.args.report_gil_contention:
+            knocker.stop()
+
         self.queue.put(times)
+        if self.args.report_gil_contention:
+            self.queue.put(knocker.contention_metric)
diff --git a/ucp/benchmarks/backends/ucp_async.py b/ucp/benchmarks/backends/ucp_async.py
@@ -128,6 +128,15 @@ async def run(self):
 
         if self.args.cuda_profile:
             xp.cuda.profiler.start()
+
+        if self.args.report_gil_contention:
+            from gilknocker import KnockKnock
+
+            # Use the smallest polling interval possible; otherwise contention will
+            # always be zero for small messages and inconsistent for large messages.
+            knocker = KnockKnock(polling_interval_micros=1)
+            knocker.start()
+
         times = []
         for i in range(self.args.n_iter + self.args.n_warmup_iter):
             start = monotonic()
@@ -143,9 +152,15 @@ async def run(self):
             stop = monotonic()
             if i >= self.args.n_warmup_iter:
                 times.append(stop - start)
+
+        if self.args.report_gil_contention:
+            knocker.stop()
         if self.args.cuda_profile:
             xp.cuda.profiler.stop()
+
         self.queue.put(times)
+        if self.args.report_gil_contention:
+            self.queue.put(knocker.contention_metric)
 
     def print_backend_specific_config(self):
         print_key_value(

diff --git a/ucp/benchmarks/backends/ucp_core.py b/ucp/benchmarks/backends/ucp_core.py
@@ -263,6 +263,13 @@ def op_completed():
 
         if self.args.cuda_profile:
             xp.cuda.profiler.start()
+        if self.args.report_gil_contention:
+            from gilknocker import KnockKnock
+
+            # Use the smallest polling interval possible; otherwise contention will
+            # always be zero for small messages and inconsistent for large messages.
+            knocker = KnockKnock(polling_interval_micros=1)
+            knocker.start()
 
         times = []
         last_iter = self.args.n_iter + self.args.n_warmup_iter - 1
@@ -292,10 +299,14 @@ def op_completed():
             if i >= self.args.n_warmup_iter:
                 times.append(stop - start)
 
+        if self.args.report_gil_contention:
+            knocker.stop()
         if self.args.cuda_profile:
             xp.cuda.profiler.stop()
 
         self.queue.put(times)
+        if self.args.report_gil_contention:
+            self.queue.put(knocker.contention_metric)
 
     def print_backend_specific_config(self):
         delay_progress_str = (

diff --git a/ucp/benchmarks/send_recv.py b/ucp/benchmarks/send_recv.py
@@ -97,6 +97,8 @@ def client(queue, port, server_address, args):
         client.run()
 
     times = queue.get()
+    if args.report_gil_contention:
+        contention_metric = queue.get()
 
     assert len(times) == args.n_iter
     bw_avg = format_bytes(2 * args.n_iter * args.n_bytes / sum(times))
@@ -133,6 +135,8 @@ def client(queue, port, server_address, args):
     print_key_value("Bandwidth (median)", value=f"{bw_med}/s")
     print_key_value("Latency (average)", value=f"{lat_avg} ns")
     print_key_value("Latency (median)", value=f"{lat_med} ns")
+    if args.report_gil_contention:
+        print_key_value("GIL contention", value=f"{contention_metric}")
     if not args.no_detailed_report:
         print_separator(separator="=")
         print_key_value(key="Iterations", value="Bandwidth, Latency")
@@ -297,6 +301,12 @@ def parse_args():
         help="Backend Library (-l) to use, options are: 'ucp-async' (default), "
         "'ucp-core' and 'tornado'.",
     )
+    parser.add_argument(
+        "--report-gil-contention",
+        default=False,
+        action="store_true",
+        help="Report GIL contention (requires the `gilknocker` package).",
+    )
     parser.add_argument(
         "--delay-progress",
         default=False,
@@ -337,6 +347,15 @@ def parse_args():
     if args.backend != "ucp-core" and args.delay_progress:
         raise RuntimeError("`--delay-progress` requires `--backend=ucp-core`")
 
+    if args.report_gil_contention:
+        try:
+            import gilknocker  # noqa: F401
+        except ImportError:
+            raise RuntimeError(
+                "Could not import `gilknocker`. Make sure it is installed or "
+                "remove the `--report-gil-contention` argument."
+            )
+
     return args