From c7452697d80906011ee4aed4cd7d9b3854e491ef Mon Sep 17 00:00:00 2001
From: Vincent Ehrmanntraut <ehrmanntraut@itsec.rwth-aachen.de>
Date: Fri, 28 Jun 2024 09:44:11 +0200
Subject: [PATCH] Change MATMULSM allocator timeout from a time threshold to a
 size threshold

---
 Compiler/allocator.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/Compiler/allocator.py b/Compiler/allocator.py
index 05b6dbccd..2aaac2015 100644
--- a/Compiler/allocator.py
+++ b/Compiler/allocator.py
@@ -600,15 +600,24 @@ def keep_text_order(inst, n):
                             first_factor_row_length = instr.args[12 * matmul_idx + 10]
                             second_factor_row_length = instr.args[12 * matmul_idx + 11]
 
-                            # Add dependencies to the first factor.
-                            for i in range(instr.args[12 * matmul_idx + 3]):
-                                if (time.time() - start_time) > 10:
-                                    # Abort building the dependencies if that takes too much time.
-                                    if block.warn_about_mem and not block.parent.warned_about_mem:
-                                        print('WARNING: Order of memory instructions not preserved due to long vector, errors possible')
-                                        block.parent.warned_about_mem = True
-                                    break
+                            # Due to the potentially very large number of inputs on large matrices, adding dependencies to
+                            # all inputs may take a long time. Therefore, we only partially build the dependencies on
+                            # large matrices and output a warning.
+                            # The threshold of 2_250_000 values per matrix is equivalent to multiplying two 1500x1500
+                            # matrices. Experiments showed that multiplying two 1700x1700 matrices requires roughly 10 seconds on an i7-1370P,
+                            # so this threshold should lead to acceptable compile times even on slower processors.
+                            first_factor_total_number_of_values = instr.args[12 * matmul_idx + 3] * instr.args[12 * matmul_idx + 4]
+                            second_factor_total_number_of_values = instr.args[12 * matmul_idx + 4] * instr.args[12 * matmul_idx + 5]
+                            max_dependencies_per_matrix = 1500**2
+                            if first_factor_total_number_of_values > max_dependencies_per_matrix or second_factor_total_number_of_values > max_dependencies_per_matrix:
+                                if block.warn_about_mem and not block.parent.warned_about_mem:
+                                    print('WARNING: Order of memory instructions not preserved due to long vector, errors possible')
+                                    block.parent.warned_about_mem = True
 
+                            # Add dependencies to the first factor.
+                            # If the size of the matrix exceeds the max_dependencies_per_matrix, only a limited number
+                            # of rows will be processed.
+                            for i in range(min(instr.args[12 * matmul_idx + 3], max_dependencies_per_matrix // instr.args[12 * matmul_idx + 4] + 1)):
                                 for k in range(instr.args[12 * matmul_idx + 4]):
                                     first_factor_addr = first_base + \
                                                         first_factor_row_length * first_factor_row_indices[i] + \
@@ -616,7 +625,9 @@ def keep_text_order(inst, n):
                                     handle_mem_access(first_factor_addr, 's', last_mem_read_of, last_mem_write_of)
 
                             # Add dependencies to the second factor.
-                            for k in range(instr.args[12 * matmul_idx + 4]):
+                            # If the size of the matrix exceeds the max_dependencies_per_matrix, only a limited number
+                            # of rows will be processed.
+                            for k in range(min(instr.args[12 * matmul_idx + 4], max_dependencies_per_matrix // instr.args[12 * matmul_idx + 5] + 1)):
                                 if (time.time() - start_time) > 10:
                                     # Abort building the dependencies if that takes too much time.
                                     if block.warn_about_mem and not block.parent.warned_about_mem: