From 4d7b8a09dad64e3e364ed6037719e331d12bae82 Mon Sep 17 00:00:00 2001
From: Vincent Ehrmanntraut <ehrmanntraut@itsec.rwth-aachen.de>
Date: Tue, 11 Jun 2024 09:05:19 +0200
Subject: [PATCH 1/4] Make MATMULSM mergeable

---
 Compiler/allocator.py        |  42 ++++++--
 Compiler/instructions.py     |  52 +++++++---
 Compiler/types.py            |   8 +-
 Processor/Instruction.hpp    |   7 +-
 Processor/Processor.h        |   8 +-
 Processor/Processor.hpp      | 187 +++++++++++++++++++++++++----------
 Programs/Source/test_dot.mpc | 147 +++++++++++++++++++++++++++
 Protocols/Hemi.h             |   2 +-
 Protocols/Hemi.hpp           |  77 +++++++++------
 Protocols/Replicated.h       |   4 +-
 10 files changed, 420 insertions(+), 114 deletions(-)
 create mode 100644 Programs/Source/test_dot.mpc

diff --git a/Compiler/allocator.py b/Compiler/allocator.py
index f2154cabe..bde202793 100644
--- a/Compiler/allocator.py
+++ b/Compiler/allocator.py
@@ -581,6 +581,41 @@ def keep_text_order(inst, n):
                 keep_text_order(instr, n)
             elif isinstance(instr, RawInputInstruction):
                 keep_merged_order(instr, n, RawInputInstruction)
+            elif isinstance(instr, matmulsm):
+                if options.preserve_mem_order:
+                    strict_mem_access(n, last_mem_read, last_mem_write)
+                else:
+                    if instr.indices_values:
+                        # Determine which values get accessed by the MATMULSM instruction and only add the according dependencies.
+                        for matmul_idx in range(len(instr.first_factor_base_addresses)):
+                            first_base = instr.first_factor_base_addresses[matmul_idx]
+                            second_base = instr.second_factor_base_addresses[matmul_idx]
+
+                            first_factor_row_indices = instr.indices_values[4 * matmul_idx]
+                            first_factor_column_indices = instr.indices_values[4 * matmul_idx + 1]
+                            second_factor_row_indices = instr.indices_values[4 * matmul_idx + 2]
+                            second_factor_column_indices = instr.indices_values[4 * matmul_idx + 3]
+
+                            first_factor_row_length = instr.args[12 * matmul_idx + 10]
+                            second_factor_row_length = instr.args[12 * matmul_idx + 11]
+
+                            for i in range(instr.args[12 * matmul_idx + 3]):
+                                for j in range(instr.args[12 * matmul_idx + 5]):
+                                    for k in range(instr.args[12 * matmul_idx + 4]):
+                                        first_factor_addr = first_base + \
+                                                            first_factor_row_length * first_factor_row_indices[i] + \
+                                                            first_factor_column_indices[k]
+                                        handle_mem_access(first_factor_addr, 's', last_mem_read_of, last_mem_write_of)
+
+                                        second_factor_addr = second_base + \
+                                                             second_factor_row_length * second_factor_row_indices[k] + \
+                                                             second_factor_column_indices[j]
+                                        handle_mem_access(second_factor_addr, 's', last_mem_read_of, last_mem_write_of)
+                    else:
+                        # If the accessed values cannot be determined, be cautious I guess.
+                        for i in last_mem_write_of.values():
+                            for j in i:
+                                add_edge(j, n)
 
             if isinstance(instr, merge_classes):
                 open_nodes.add(n)
@@ -622,13 +657,6 @@ def keep_text_order(inst, n):
                     strict_mem_access(n, scope.write, scope.read)
                 if not options.preserve_mem_order:
                     mem_access(n, instr, last_mem_write_of, last_mem_read_of)
-            elif isinstance(instr, matmulsm):
-                if options.preserve_mem_order:
-                    strict_mem_access(n, last_mem_read, last_mem_write)
-                else:
-                    for i in last_mem_write_of.values():
-                        for j in i:
-                            add_edge(j, n)
             # keep I/O instructions in order
             elif isinstance(instr, IOInstruction):
                 if last_print_str is not None:
diff --git a/Compiler/instructions.py b/Compiler/instructions.py
index 9e7b23e7d..62c56533f 100644
--- a/Compiler/instructions.py
+++ b/Compiler/instructions.py
@@ -2484,7 +2484,7 @@ def get_repeat(self):
         return sum(reduce(operator.mul, self.args[i + 3:i + 6])
                    for i in range(0, len(self.args), 6))
 
-class matmulsm(matmul_base):
+class matmulsm(matmul_base, base.Mergeable):
     """ Secret matrix multiplication reading directly from memory.
 
     :param: result (sint vector in row-first order)
@@ -2494,26 +2494,48 @@ class matmulsm(matmul_base):
     :param: number of columns in first factor and rows in second factor (int)
     :param: number of columns in second factor and result (int)
     :param: rows of first factor to use (regint vector, length as number of rows in first factor)
-    :param: columns of first factor to use (regint vector, length below)
-    :param: rows of second factor to use (regint vector, length below)
-    :param: columns of second factor to use (regint vector, length below)
-    :param: number of columns of first / rows of second factor to use (int)
-    :param: number of columns of second factor to use (int)
+    :param: columns of first factor to use (regint vector, length as number of columns in the first factor)
+    :param: rows of second factor to use (regint vector, length as number of columns in the first factor)
+    :param: columns of second factor to use (regint vector, length as number of columns in the second factor)
+    :param: total number of columns in the first factor, equal to used number of columns when all columns are used (int)
+    :param: total number of columns in the second factor, equal to used number of columns when all columns are used (int)
     """
     code = base.opcodes['MATMULSM']
-    arg_format = ['sw','ci','ci','int','int','int','ci','ci','ci','ci',
-                  'int','int']
-
-    def __init__(self, *args, **kwargs):
+    arg_format = itertools.cycle(['sw','ci','ci','int','int','int','ci','ci','ci','ci',
+                                  'int','int'])
+
+    first_factor_base_addresses: list[int] | None
+    second_factor_base_addresses: list[int] | None
+    indices_values: list[list[int]] | None
+
+    def __init__(self, *args,
+                 first_factor_base_addresses: list[int] | None = None,
+                 second_factor_base_addresses: list[int] | None = None,
+                 indices_values: list[int] | None = None,
+                 **kwargs):
         matmul_base.__init__(self, *args, **kwargs)
-        for i in range(2):
-            assert args[6 + i].size == args[3 + i]
-        for i in range(2):
-            assert args[8 + i].size == args[4 + i]
+        for matmul_index in range(len(args) // 12):
+            for i in range(2):
+                assert args[12 * matmul_index + 6 + i].size == args[12 * matmul_index + 3 + i]
+            for i in range(2):
+                assert args[12 * matmul_index + 8 + i].size == args[12 * matmul_index + 4 + i]
+
+        # These are used to reconstruct that accessed memory addresses in the allocator.
+        self.first_factor_base_addresses = first_factor_base_addresses
+        self.second_factor_base_addresses = second_factor_base_addresses
+        self.indices_values = indices_values
+
+        assert len(first_factor_base_addresses) == len(second_factor_base_addresses)
+        assert len(indices_values) == 4 * len(first_factor_base_addresses)
 
     def add_usage(self, req_node):
         super(matmulsm, self).add_usage(req_node)
-        req_node.increment(('matmul', tuple(self.args[3:6])), 1)
+        for i in range(0, len(self.args), 12):
+            req_node.increment(('matmul', (self.args[i + 3], self.args[i + 4], self.args[i + 5])), 1)
+
+    def get_repeat(self):
+        return sum(reduce(operator.mul, self.args[i + 3:i + 6])
+                   for i in range(0, len(self.args), 12))
 
 class conv2ds(base.DataInstruction, base.VarArgsInstruction, base.Mergeable):
     """ Secret 2D convolution.
diff --git a/Compiler/types.py b/Compiler/types.py
index 007bcd8bd..f5cc75b58 100644
--- a/Compiler/types.py
+++ b/Compiler/types.py
@@ -2668,12 +2668,16 @@ def store_in_mem(self, address):
         self._store_in_mem(address, stms, stmsi)
 
     @classmethod
-    def direct_matrix_mul(cls, A, B, n, m, l, reduce=None, indices=None):
+    def direct_matrix_mul(cls, A, B, n, m, l, reduce=None, indices=None, indices_values=None):
         if indices is None:
             indices = [regint.inc(i) for i in (n, m, m, l)]
+            indices_values = [list(range(i)) for i in (n, m, m, l)]
         res = cls(size=indices[0].size * indices[3].size)
         matmulsm(res, regint(A), regint(B), len(indices[0]), len(indices[1]),
-                 len(indices[3]), *(list(indices) + [m, l]))
+                 len(indices[3]), *(list(indices) + [m, l]),
+                 first_factor_base_addresses=[A],
+                 second_factor_base_addresses=[B],
+                 indices_values=indices_values)
         return res
 
     @vectorize_init
diff --git a/Processor/Instruction.hpp b/Processor/Instruction.hpp
index ef21e3728..90f4db526 100644
--- a/Processor/Instruction.hpp
+++ b/Processor/Instruction.hpp
@@ -323,8 +323,8 @@ void BaseInstruction::parse_operands(istream& s, int pos, int file_pos)
         get_vector(num_var_args, start, s);
         break;
       case MATMULSM:
-        get_ints(r, s, 3);
-        get_vector(9, start, s);
+        num_var_args = get_int(s);
+        get_vector(num_var_args, start, s);
         break;
 
       // read from file, input is opcode num_args, 
@@ -1117,8 +1117,7 @@ inline void Instruction::execute(Processor<sint, sgf2n>& Proc) const
         Proc.Procp.matmuls(Proc.Procp.get_S(), *this);
         return;
       case MATMULSM:
-        Proc.Procp.protocol.matmulsm(Proc.Procp, Proc.machine.Mp.MS, *this,
-            Proc.read_Ci(r[1]), Proc.read_Ci(r[2]));
+        Proc.Procp.protocol.matmulsm(Proc.Procp, Proc.machine.Mp.MS, *this);
         return;
       case CONV2DS:
         Proc.Procp.protocol.conv2ds(Proc.Procp, *this);
diff --git a/Processor/Processor.h b/Processor/Processor.h
index 9b4757f4e..08f4cd269 100644
--- a/Processor/Processor.h
+++ b/Processor/Processor.h
@@ -77,8 +77,12 @@ class SubProcessor
   void mulrs(const vector<int>& reg);
   void dotprods(const vector<int>& reg, int size);
   void matmuls(const vector<T>& source, const Instruction& instruction);
-  void matmulsm(const MemoryPart<T>& source, const Instruction& instruction, size_t a,
-      size_t b);
+  void matmulsm(const MemoryPart<T>& source, const Instruction& instruction);
+
+  void matmulsm_finalize_batch(vector<int>::const_iterator startMatmul, int startI, int startJ,
+                               vector<int>::const_iterator endMatmul,
+                               int endI, int endJ);
+
   void conv2ds(const Instruction& instruction);
 
   void secure_shuffle(const Instruction& instruction);
diff --git a/Processor/Processor.hpp b/Processor/Processor.hpp
index d468a6302..aab2ba9d0 100644
--- a/Processor/Processor.hpp
+++ b/Processor/Processor.hpp
@@ -601,73 +601,156 @@ void SubProcessor<T>::matmuls(const vector<T>& source,
     }
 }
 
+
 template<class T>
 void SubProcessor<T>::matmulsm(const MemoryPart<T>& source,
-        const Instruction& instruction, size_t a, size_t b)
+        const Instruction& instruction)
 {
-    auto& dim = instruction.get_start();
-    auto C = S.begin() + (instruction.get_r(0));
-    assert(C + dim[0] * dim[2] <= S.end());
     assert(Proc);
 
-    int base = 0;
-    int base2 = 0;
+    auto& start = instruction.get_start();
+
+    auto batchStartMatrix = start.begin();
+    int batchStartI = 0;
+    int batchStartJ = 0;
+
+    size_t sourceSize = source.size();
+    const T* sourceData = source.data();
+
     protocol.init_dotprod();
-    for (int i = 0; i < dim[0]; i++)
-    {
-        auto ii = Proc->get_Ci().at(dim[3] + i).get();
-        for (int j = 0; j < dim[2]; j++)
-        {
-#ifdef DEBUG_MATMULSM
-            cerr << "matmulsm prep " << i << " " << j << endl;
-#endif
-            matmulsm_prep(ii, j, source, dim, a, b);
-            if (protocol.get_buffer_size() > OnlineOptions::singleton.batch_size)
-            {
-#ifdef DEBUG_MATMULSM
-                cerr << "matmulsm round " << protocol.get_buffer_size() << endl;
+    for (auto matmulArgs = start.begin(); matmulArgs < start.end(); matmulArgs += 12) {
+        auto output = S.begin() + matmulArgs[0];
+        size_t firstFactorBase  = Proc->get_Ci().at(matmulArgs[1]).get();
+        size_t secondFactorBase = Proc->get_Ci().at(matmulArgs[2]).get();
+        auto resultNumberOfRows = matmulArgs[3];
+        auto usedNumberOfFirstFactorColumns = matmulArgs[4];
+        auto resultNumberOfColumns = matmulArgs[5];
+        auto firstFactorTotalNumberOfColumns = matmulArgs[10];
+        auto secondFactorTotalNumberOfColumns = matmulArgs[11];
+
+        assert(output + resultNumberOfRows * resultNumberOfColumns <= S.end());
+
+        for (int i = 0; i < resultNumberOfRows; i += 1) {
+            auto actualFirstFactorRow = Proc->get_Ci().at(matmulArgs[6] + i).get();
+
+            for (int j = 0; j < resultNumberOfColumns; j += 1) {
+                auto actualSecondFactorColumn = Proc->get_Ci().at(matmulArgs[9] + j).get();
+
+#ifdef MATMULSM_DEBUG
+                cout << "Preparing " << i << "," << j << "(buffer size: " << protocol.get_buffer_size() << ")" << endl;
 #endif
-                protocol.exchange();
-                if (base < i)
-                    for (int l = base2; l < dim[2]; l++)
-                        matmulsm_finalize(base, l, dim, C);
-                for (int k = base + 1; k < i; k++)
-                    for (int l = 0; l < dim[2]; l++)
-                        matmulsm_finalize(k, l, dim, C);
-                for (int l = base < i ? 0 : base2; l <= j; l++)
-                    matmulsm_finalize(i, l, dim, C);
-                base = i;
-                base2 = j + 1;
-                protocol.init_dotprod();
+
+                for (int k = 0; k < usedNumberOfFirstFactorColumns; k += 1) {
+                    auto actualFirstFactorColumn = Proc->get_Ci().at(matmulArgs[7] + k).get();
+                    auto actualSecondFactorRow = Proc->get_Ci().at(matmulArgs[8] + k).get();
+
+                    auto firstAddress = firstFactorBase + actualFirstFactorRow * firstFactorTotalNumberOfColumns + actualFirstFactorColumn;
+                    auto secondAddress = secondFactorBase + actualSecondFactorRow * secondFactorTotalNumberOfColumns + actualSecondFactorColumn;
+
+                    assert(firstAddress < sourceSize);
+                    assert(secondAddress < sourceSize);
+
+                    protocol.prepare_dotprod(sourceData[firstAddress], sourceData[secondAddress]);
+                }
+                protocol.next_dotprod();
+
+                if (protocol.get_buffer_size() > OnlineOptions::singleton.batch_size) {
+                    protocol.exchange();
+
+                    matmulsm_finalize_batch(batchStartMatrix, batchStartI, batchStartJ,
+                        matmulArgs, i, j);
+                    batchStartMatrix = matmulArgs;
+                    batchStartI = i;
+                    batchStartJ = j + 1;
+
+                    protocol.init_dotprod();
+                }
             }
         }
     }
+
     protocol.exchange();
-    for (int j = base2; j < dim[2]; j++)
-        matmulsm_finalize(base, j, dim, C);
-    for (int i = base + 1; i < dim[0]; i++)
-        for (int j = 0; j < dim[2]; j++)
-            matmulsm_finalize(i, j, dim, C);
+    auto lastMatmulsArgs = start.end() - 12;
+    auto lastMatrixRows = lastMatmulsArgs[3];
+    auto lastMatrixColumns = lastMatmulsArgs[5];
+    matmulsm_finalize_batch(batchStartMatrix, batchStartI, batchStartJ,
+                        lastMatmulsArgs, lastMatrixRows - 1, lastMatrixColumns - 1);
 }
 
 template<class T>
-void SubProcessor<T>::matmulsm_prep(int ii, int j, const MemoryPart<T>& source,
-        const vector<int>& dim, size_t a, size_t b)
-{
-    auto jj = Proc->get_Ci().at(dim[6] + j).get();
-    const T* base = source.data();
-    size_t size = source.size();
-    for (int k = 0; k < dim[1]; k++)
-    {
-        auto kk = Proc->get_Ci().at(dim[4] + k).get();
-        auto ll = Proc->get_Ci().at(dim[5] + k).get();
-        auto aa = a + ii * dim[7] + kk;
-        auto bb = b + ll * dim[8] + jj;
-        assert(aa < size);
-        assert(bb < size);
-        protocol.prepare_dotprod(base[aa], base[bb]);
+void SubProcessor<T>::matmulsm_finalize_batch(vector<int>::const_iterator startMatmul, int startI, int startJ,
+    vector<int>::const_iterator endMatmul, int endI, int endJ) {
+
+    for (auto matmulArgs = startMatmul; matmulArgs <= endMatmul; matmulArgs += 12) {
+        auto output = S.begin() + matmulArgs[0];
+        auto resultNumberOfRows = matmulArgs[3];
+        auto usedNumberOfFirstFactorColumns = matmulArgs[4];
+        auto resultNumberOfColumns = matmulArgs[5];
+
+        assert(output + resultNumberOfRows * resultNumberOfColumns <= S.end());
+
+        // Finish the first unfinished row in the current matrix.
+        int firstRowEndJ = resultNumberOfColumns - 1;
+        if (matmulArgs == endMatmul && startI == endI) // For the case that the batch covers only a part of the first row of current matrix or only part of a single row.
+            firstRowEndJ = endJ;
+            #ifdef MATMULSM_DEBUG
+                    cout << "Batch is in single row " << endJ << endl;
+            #endif
+        for (int j = startJ; j <= firstRowEndJ; j += 1) {
+#ifdef MATMULSM_DEBUG
+            cout << "Finalizing (first row) " << startI << "," << j << endl;
+#endif
+            *(output + startI * resultNumberOfColumns + j) = protocol.finalize_dotprod(usedNumberOfFirstFactorColumns);
+        }
+        if (firstRowEndJ == resultNumberOfColumns - 1) {
+            startJ = 0;
+            startI += 1;
+        }
+        else {
+            // The whole batch covers only a part of a single row.
+            startJ = endJ + 1;
+        }
+
+        // Determine the point up until which the batch runs in the current matrix.
+        int currentMatrixEndI = resultNumberOfRows - 1;
+        int currentMatrixEndJ = resultNumberOfColumns - 1;
+        if (matmulArgs == endMatmul) {
+            currentMatrixEndI = endI;
+            currentMatrixEndJ = endJ;
+        }
+
+        // Finish the rows that always are complete, i.e., the second to the "second to last" row.
+        for (; startI <= currentMatrixEndI - 1; startI += 1) {
+            for (int j = 0; j < resultNumberOfColumns; j += 1) {
+#ifdef MATMULSM_DEBUG
+                cout << "Finalizing (main part) " << startI << "," << j << endl;
+#endif
+                *(output + startI * resultNumberOfColumns + j) = protocol.finalize_dotprod(usedNumberOfFirstFactorColumns);
+            }
+        }
+
+        // (Partially) finish the last row.
+        if (startI == currentMatrixEndI) {
+            for (; startJ <= currentMatrixEndJ; startJ += 1) {
+#ifdef MATMULSM_DEBUG
+                cout << "Finalizing (last row) " << startI << "," << startJ << endl;
+#endif
+                *(output + startI * resultNumberOfColumns + startJ) = protocol.finalize_dotprod(usedNumberOfFirstFactorColumns);
+            }
+        }
+        else {
+#ifdef MATMULSM_DEBUG
+            // This happens when there is only one row.
+            cout << "Skipping final row of matrix because it was handled previously." << endl;
+#endif
+        }
+
+        if (matmulArgs < endMatmul) {
+            // Reset startI and startJ to the beginning of the matrix.
+            startI = 0;
+            startJ = 0;
+        }
     }
-    protocol.next_dotprod();
 }
 
 template<class T>
diff --git a/Programs/Source/test_dot.mpc b/Programs/Source/test_dot.mpc
new file mode 100644
index 000000000..6063ca2e3
--- /dev/null
+++ b/Programs/Source/test_dot.mpc
@@ -0,0 +1,147 @@
+a = Array.create_from([sint(1), sint(2), sint(3), sint(4)])
+b = Array.create_from([sint(3), sint(2), sint(1)])
+
+c = Matrix.create_from([
+    [sint(1), sint(2), sint(3)],
+    [sint(4), sint(5), sint(6)],
+    [sint(7), sint(8), sint(9)],
+    [sint(10), sint(11), sint(12)]
+])
+
+d = Matrix.create_from([
+    [sint(12), sint(11), sint(10), sint(9)],
+    [sint(8), sint(7), sint(6), sint(5)],
+    [sint(4), sint(3), sint(2), sint(1)]
+])
+
+
+def test_array(expected: list[int], actual: Array) -> None:
+    actual = actual.reveal()
+    expected = Array.create_from([cint(x) for x in expected])
+    @for_range(len(expected))
+    def _(i: cint) -> None:
+        @if_(actual[i] != expected[i])
+        def fail():
+            print_ln("Unexpected entry at index %s", i)
+            print_ln("Expected:")
+            expected.print_reveal_nested()
+            print_ln("Actual:")
+            actual.print_reveal_nested()
+
+            crash()
+
+
+def test_matrix(expected: list[list[int]], actual: Matrix) -> None:
+    actual = actual.reveal()
+    expected = Matrix.create_from([[cint(x) for x in row] for row in expected])
+    @for_range(len(expected))
+    def outer(i: cint) -> None:
+
+        @for_range(len(expected[0]))
+        def inner(j: cint) -> None:
+            @if_(actual[i][j] != expected[i][j])
+            def fail():
+                print_ln("Unexpected entry at index %s,%s", i, j)
+                print_ln("Expected:")
+                expected.print_reveal_nested()
+                print_ln("Actual:")
+                actual.print_reveal_nested()
+
+                crash()
+
+break_point()
+def hacky_array_dot_matrix(arr: Array, mat: Matrix) -> Array:
+    # Arrays sadly do not have a dot function, therefore the array is converted into a 1 times n Matrix by copying memory addresses.
+    tmp = sint.Matrix(rows=1, columns=len(arr), address=arr.address)
+    result = tmp.dot(mat)
+    return sint.Array(mat.shape[1], result.address)
+
+start_timer(3)
+
+e3 = hacky_array_dot_matrix(a, c)
+# b[0] = e3[0]
+f3 = hacky_array_dot_matrix(b, d)
+
+stop_timer(3)
+
+e3 = e3.reveal()
+f3 = f3.reveal()
+
+e3.print_reveal_nested()
+f3.print_reveal_nested()
+
+test_array([70, 80, 90], e3)
+test_array([56, 50, 44, 38], f3)
+
+start_timer(4)
+
+e4 = hacky_array_dot_matrix(a, c)
+b[-1] = e4[0]
+f4 = hacky_array_dot_matrix(b, d)
+
+stop_timer(4)
+
+test_array([70, 80, 90], e4)
+test_array([332, 257, 182, 107], f4)
+
+f4.print_reveal_nested()
+
+# TODO: Crashes
+
+
+start_timer(5)
+g = c.dot(d)
+stop_timer(5)
+
+test_matrix([
+    [ 40,  34,  28,  22],
+    [112,  97,  82,  67],
+    [184, 160, 136, 112],
+    [256, 223, 190, 157]
+], g)
+g.print_reveal_nested()
+
+
+# Big matrix tests.
+# These are intended to test matrix multiplications that require multiple batches.
+
+def identity(size: int) -> Matrix:
+    result = sint.Matrix(rows=size, columns=size)
+    result.assign_all(0)
+    for i in range(size):
+        result[i][i] = 1
+    return result
+
+
+def counting_matrix(rows: int, columns: int) -> Matrix:
+    result = sint.Matrix(rows, columns)
+    @for_range(rows)
+    def outer(i: cint) -> None:
+        @for_range(columns)
+        def inner(j: cint) -> None:
+            result[i][j] = i * columns + j
+    return result
+
+
+def clear_counting_matrix(rows: int, columns: int) -> list[list[int]]:
+    return [list(range(i * columns, (i + 1) * columns)) for i in range(rows)]
+
+
+# Single matrix multiplication requiring multiple batches.
+a = counting_matrix(20, 20)
+b = identity(20)
+
+start_timer(6)
+c = a * b
+stop_timer(6)
+
+test_matrix(clear_counting_matrix(20, 20), c)
+
+# Multiple matrix multiplications requiring multiple batches.
+start_timer(7)
+d = a * b
+e = c * b
+stop_timer(7)
+
+test_matrix(clear_counting_matrix(20, 20), d)
+test_matrix(clear_counting_matrix(20, 20), e)
diff --git a/Protocols/Hemi.h b/Protocols/Hemi.h
index 2073eac26..2bceb9f31 100644
--- a/Protocols/Hemi.h
+++ b/Protocols/Hemi.h
@@ -34,7 +34,7 @@ class Hemi : public T::BasicProtocol
             SubProcessor<T>& processor);
 
     void matmulsm(SubProcessor<T>& processor, MemoryPart<T>& source,
-            const Instruction& instruction, int a, int b);
+            const Instruction& instruction);
     void conv2ds(SubProcessor<T>& processor, const Instruction& instruction);
 };
 
diff --git a/Protocols/Hemi.hpp b/Protocols/Hemi.hpp
index b232bc42d..807f5fc79 100644
--- a/Protocols/Hemi.hpp
+++ b/Protocols/Hemi.hpp
@@ -34,51 +34,70 @@ typename T::MatrixPrep& Hemi<T>::get_matrix_prep(const array<int, 3>& dims,
 
 template<class T>
 void Hemi<T>::matmulsm(SubProcessor<T>& processor, MemoryPart<T>& source,
-        const Instruction& instruction, int a, int b)
+        const Instruction& instruction)
 {
     if (HemiOptions::singleton.plain_matmul
             or not OnlineOptions::singleton.live_prep)
     {
-        processor.matmulsm(source, instruction, a, b);
+        processor.matmulsm(source, instruction);
         return;
     }
 
-    auto& dim = instruction.get_start();
-    auto& S = processor.get_S();
-    auto C = S.begin() + (instruction.get_r(0));
-    assert(C + dim[0] * dim[2] <= S.end());
+    // Perform the matrix multiplications in sequence.
+    // They are not merged into one communication round since that would require multiple matrix_preps to
+    // merge rounds.
+    // An improvement might be to merge the communication of multiple matrices with the same dimension into one round,
+    // which is not implemented yet.
     auto Proc = processor.Proc;
     assert(Proc);
+    auto& S = processor.get_S();
+    auto& start = instruction.get_start();
+
+    for (auto matmulArgs = start.begin(); matmulArgs < start.end(); matmulArgs += 12) {
+        auto C = S.begin() + matmulArgs[0];
+        size_t firstFactorBase  = Proc->get_Ci().at(matmulArgs[1]).get();
+        size_t secondFactorBase = Proc->get_Ci().at(matmulArgs[2]).get();
+        auto resultNumberOfRows = matmulArgs[3];
+        auto usedNumberOfFirstFactorColumns = matmulArgs[4];
+        auto resultNumberOfColumns = matmulArgs[5];
+        auto firstFactorTotalNumberOfColumns = matmulArgs[10];
+        auto secondFactorTotalNumberOfColumns = matmulArgs[11];
+
+        assert(C + resultNumberOfRows * resultNumberOfColumns <= S.end());
+
+        ShareMatrix<T> A(resultNumberOfRows, usedNumberOfFirstFactorColumns), B(usedNumberOfFirstFactorColumns, resultNumberOfColumns);
+        if (not T::real_shares(processor.P))
+        {
+            matrix_multiply(A, B, processor);
+            return;
+        }
 
-    ShareMatrix<T> A(dim[0], dim[1]), B(dim[1], dim[2]);
+        for (int i = 0; i < resultNumberOfRows; i++) {
+            auto actualFirstFactorRow = Proc->get_Ci().at(matmulArgs[6] + i).get();
 
-    if (not T::real_shares(processor.P))
-    {
-        matrix_multiply(A, B, processor);
-        return;
-    }
-
-    for (int i = 0; i < dim[0]; i++)
-        for (int k = 0; k < dim[1]; k++)
-        {
-            auto kk = Proc->get_Ci().at(dim[4] + k).get();
-            auto ii = Proc->get_Ci().at(dim[3] + i).get();
-            A.entries.v.push_back(source.at(a + ii * dim[7] + kk));
+            for (int k = 0; k < usedNumberOfFirstFactorColumns; k++)
+            {
+                auto actualFirstFactorColumn = Proc->get_Ci().at(matmulArgs[7] + k).get();
+                A.entries.v.push_back(source.at(firstFactorBase + actualFirstFactorRow * firstFactorTotalNumberOfColumns + actualFirstFactorColumn));
+            }
         }
 
-    for (int k = 0; k < dim[1]; k++)
-        for (int j = 0; j < dim[2]; j++)
-        {
-            auto jj = Proc->get_Ci().at(dim[6] + j).get();
-            auto ll = Proc->get_Ci().at(dim[5] + k).get();
-            B.entries.v.push_back(source.at(b + ll * dim[8] + jj));
+
+        for (int k = 0; k < usedNumberOfFirstFactorColumns; k++) {
+            auto actualSecondFactorRow = Proc->get_Ci().at(matmulArgs[8] + k).get();
+            for (int j = 0; j < resultNumberOfColumns; j++)
+            {
+                auto actualSecondFactorColumn = Proc->get_Ci().at(matmulArgs[9] + j).get();
+                B.entries.v.push_back(source.at(secondFactorBase + actualSecondFactorRow * secondFactorTotalNumberOfColumns + actualSecondFactorColumn));
+            }
         }
 
-    auto res = matrix_multiply(A, B, processor);
+        auto res = matrix_multiply(A, B, processor);
 
-    for (int i = 0; i < dim[0]; i++)
-        for (int j = 0; j < dim[2]; j++)
-            *(C + i * dim[2] + j) = res[{i, j}];
+        for (int i = 0; i < resultNumberOfRows; i++)
+            for (int j = 0; j < resultNumberOfColumns; j++)
+                *(C + i * resultNumberOfColumns + j) = res[{i, j}];
+    }
 }
 
 template<class T>
diff --git a/Protocols/Replicated.h b/Protocols/Replicated.h
index 4fb5a6317..1f1176ff7 100644
--- a/Protocols/Replicated.h
+++ b/Protocols/Replicated.h
@@ -111,8 +111,8 @@ class ProtocolBase
 
     template<int = 0>
     void matmulsm(SubProcessor<T> & proc, MemoryPart<T>& source,
-            const Instruction& instruction, int a, int b)
-    { proc.matmulsm(source, instruction, a, b); }
+            const Instruction& instruction)
+    { proc.matmulsm(source, instruction); }
 
     template<int = 0>
     void conv2ds(SubProcessor<T>& proc, const Instruction& instruction)

From a490c391beb9eda3fe60a59386e82fd94a0fd978 Mon Sep 17 00:00:00 2001
From: Vincent Ehrmanntraut <ehrmanntraut@itsec.rwth-aachen.de>
Date: Mon, 24 Jun 2024 11:42:18 +0200
Subject: [PATCH 2/4] Fix multithreaded MATMULSM, direct_mul. Should also now
 work with Python 3.8

---
 Compiler/allocator.py        |  2 +-
 Compiler/instructions.py     | 16 +++++++---------
 Compiler/types.py            | 12 ++++++++++--
 Programs/Source/test_dot.mpc | 35 ++++++++++++++++++++++++-----------
 4 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/Compiler/allocator.py b/Compiler/allocator.py
index bde202793..3ff2c889e 100644
--- a/Compiler/allocator.py
+++ b/Compiler/allocator.py
@@ -585,7 +585,7 @@ def keep_text_order(inst, n):
                 if options.preserve_mem_order:
                     strict_mem_access(n, last_mem_read, last_mem_write)
                 else:
-                    if instr.indices_values:
+                    if instr.indices_values is not None and instr.first_factor_base_addresses is not None and instr.second_factor_base_addresses is not None:
                         # Determine which values get accessed by the MATMULSM instruction and only add the according dependencies.
                         for matmul_idx in range(len(instr.first_factor_base_addresses)):
                             first_base = instr.first_factor_base_addresses[matmul_idx]
diff --git a/Compiler/instructions.py b/Compiler/instructions.py
index 62c56533f..230f62539 100644
--- a/Compiler/instructions.py
+++ b/Compiler/instructions.py
@@ -2504,14 +2504,10 @@ class matmulsm(matmul_base, base.Mergeable):
     arg_format = itertools.cycle(['sw','ci','ci','int','int','int','ci','ci','ci','ci',
                                   'int','int'])
 
-    first_factor_base_addresses: list[int] | None
-    second_factor_base_addresses: list[int] | None
-    indices_values: list[list[int]] | None
-
     def __init__(self, *args,
-                 first_factor_base_addresses: list[int] | None = None,
-                 second_factor_base_addresses: list[int] | None = None,
-                 indices_values: list[int] | None = None,
+                 first_factor_base_addresses=None,
+                 second_factor_base_addresses=None,
+                 indices_values=None,
                  **kwargs):
         matmul_base.__init__(self, *args, **kwargs)
         for matmul_index in range(len(args) // 12):
@@ -2525,8 +2521,10 @@ def __init__(self, *args,
         self.second_factor_base_addresses = second_factor_base_addresses
         self.indices_values = indices_values
 
-        assert len(first_factor_base_addresses) == len(second_factor_base_addresses)
-        assert len(indices_values) == 4 * len(first_factor_base_addresses)
+        if first_factor_base_addresses is not None:
+            assert len(first_factor_base_addresses) == len(second_factor_base_addresses)
+            if indices_values is not None:
+                assert len(indices_values) == 4 * len(first_factor_base_addresses)
 
     def add_usage(self, req_node):
         super(matmulsm, self).add_usage(req_node)
diff --git a/Compiler/types.py b/Compiler/types.py
index f5cc75b58..1432fd119 100644
--- a/Compiler/types.py
+++ b/Compiler/types.py
@@ -2673,10 +2673,18 @@ def direct_matrix_mul(cls, A, B, n, m, l, reduce=None, indices=None, indices_val
             indices = [regint.inc(i) for i in (n, m, m, l)]
             indices_values = [list(range(i)) for i in (n, m, m, l)]
         res = cls(size=indices[0].size * indices[3].size)
+
+        if isinstance(A, int) and isinstance(B, int):
+            first_factor_base_addresses = [A]
+            second_factor_base_addresses = [B]
+        else:
+            first_factor_base_addresses = None
+            second_factor_base_addresses = None
+
         matmulsm(res, regint(A), regint(B), len(indices[0]), len(indices[1]),
                  len(indices[3]), *(list(indices) + [m, l]),
-                 first_factor_base_addresses=[A],
-                 second_factor_base_addresses=[B],
+                 first_factor_base_addresses=first_factor_base_addresses,
+                 second_factor_base_addresses=second_factor_base_addresses,
                  indices_values=indices_values)
         return res
 
diff --git a/Programs/Source/test_dot.mpc b/Programs/Source/test_dot.mpc
index 6063ca2e3..148caa0d5 100644
--- a/Programs/Source/test_dot.mpc
+++ b/Programs/Source/test_dot.mpc
@@ -15,11 +15,11 @@ d = Matrix.create_from([
 ])
 
 
-def test_array(expected: list[int], actual: Array) -> None:
+def test_array(expected, actual):
     actual = actual.reveal()
     expected = Array.create_from([cint(x) for x in expected])
     @for_range(len(expected))
-    def _(i: cint) -> None:
+    def _(i):
         @if_(actual[i] != expected[i])
         def fail():
             print_ln("Unexpected entry at index %s", i)
@@ -31,14 +31,14 @@ def test_array(expected: list[int], actual: Array) -> None:
             crash()
 
 
-def test_matrix(expected: list[list[int]], actual: Matrix) -> None:
+def test_matrix(expected, actual):
     actual = actual.reveal()
     expected = Matrix.create_from([[cint(x) for x in row] for row in expected])
     @for_range(len(expected))
-    def outer(i: cint) -> None:
+    def outer(i):
 
         @for_range(len(expected[0]))
-        def inner(j: cint) -> None:
+        def inner(j):
             @if_(actual[i][j] != expected[i][j])
             def fail():
                 print_ln("Unexpected entry at index %s,%s", i, j)
@@ -50,7 +50,7 @@ def test_matrix(expected: list[list[int]], actual: Matrix) -> None:
                 crash()
 
 break_point()
-def hacky_array_dot_matrix(arr: Array, mat: Matrix) -> Array:
+def hacky_array_dot_matrix(arr, mat):
     # Arrays sadly do not have a dot function, therefore the array is converted into a 1 times n Matrix by copying memory addresses.
     tmp = sint.Matrix(rows=1, columns=len(arr), address=arr.address)
     result = tmp.dot(mat)
@@ -105,7 +105,7 @@ g.print_reveal_nested()
 # Big matrix tests.
 # These are intended to test matrix multiplications that require multiple batches.
 
-def identity(size: int) -> Matrix:
+def identity(size):
     result = sint.Matrix(rows=size, columns=size)
     result.assign_all(0)
     for i in range(size):
@@ -113,17 +113,17 @@ def identity(size: int) -> Matrix:
     return result
 
 
-def counting_matrix(rows: int, columns: int) -> Matrix:
+def counting_matrix(rows, columns):
     result = sint.Matrix(rows, columns)
     @for_range(rows)
-    def outer(i: cint) -> None:
+    def outer(i):
         @for_range(columns)
-        def inner(j: cint) -> None:
+        def inner(j):
             result[i][j] = i * columns + j
     return result
 
 
-def clear_counting_matrix(rows: int, columns: int) -> list[list[int]]:
+def clear_counting_matrix(rows, columns):
     return [list(range(i * columns, (i + 1) * columns)) for i in range(rows)]
 
 
@@ -145,3 +145,16 @@ stop_timer(7)
 
 test_matrix(clear_counting_matrix(20, 20), d)
 test_matrix(clear_counting_matrix(20, 20), e)
+
+
+start_timer(8)
+d = a.dot(b, n_threads=2)
+stop_timer(8)
+
+test_matrix(clear_counting_matrix(20, 20), d)
+
+start_timer(9)
+M = sint.Matrix(10, 10)
+M.direct_mul(M, indices=[regint(0), regint.inc(10), regint.inc(10),
+                         regint(0)])
+stop_timer(9)

From aa2441ca08c9f0af1bf2a29cb3c0fc6c18e731f1 Mon Sep 17 00:00:00 2001
From: Vincent Ehrmanntraut <ehrmanntraut@itsec.rwth-aachen.de>
Date: Tue, 25 Jun 2024 09:15:35 +0200
Subject: [PATCH 3/4] Optimize allocator, add timeout for building the
 dependency graph

---
 Compiler/allocator.py        | 38 ++++++++++++++++++++++++++----------
 Programs/Source/test_dot.mpc |  5 +++++
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/Compiler/allocator.py b/Compiler/allocator.py
index 3ff2c889e..05b6dbccd 100644
--- a/Compiler/allocator.py
+++ b/Compiler/allocator.py
@@ -588,6 +588,7 @@ def keep_text_order(inst, n):
                     if instr.indices_values is not None and instr.first_factor_base_addresses is not None and instr.second_factor_base_addresses is not None:
                         # Determine which values get accessed by the MATMULSM instruction and only add the according dependencies.
                         for matmul_idx in range(len(instr.first_factor_base_addresses)):
+                            start_time = time.time()
                             first_base = instr.first_factor_base_addresses[matmul_idx]
                             second_base = instr.second_factor_base_addresses[matmul_idx]
 
@@ -599,18 +600,35 @@ def keep_text_order(inst, n):
                             first_factor_row_length = instr.args[12 * matmul_idx + 10]
                             second_factor_row_length = instr.args[12 * matmul_idx + 11]
 
+                            # Add dependencies to the first factor.
                             for i in range(instr.args[12 * matmul_idx + 3]):
+                                if (time.time() - start_time) > 10:
+                                    # Abort building the dependencies if that takes too much time.
+                                    if block.warn_about_mem and not block.parent.warned_about_mem:
+                                        print('WARNING: Order of memory instructions not preserved due to long vector, errors possible')
+                                        block.parent.warned_about_mem = True
+                                    break
+
+                                for k in range(instr.args[12 * matmul_idx + 4]):
+                                    first_factor_addr = first_base + \
+                                                        first_factor_row_length * first_factor_row_indices[i] + \
+                                                        first_factor_column_indices[k]
+                                    handle_mem_access(first_factor_addr, 's', last_mem_read_of, last_mem_write_of)
+
+                            # Add dependencies to the second factor.
+                            for k in range(instr.args[12 * matmul_idx + 4]):
+                                if (time.time() - start_time) > 10:
+                                    # Abort building the dependencies if that takes too much time.
+                                    if block.warn_about_mem and not block.parent.warned_about_mem:
+                                        print('WARNING: Order of memory instructions not preserved due to long vector, errors possible')
+                                        block.parent.warned_about_mem = True
+                                    break
+
                                 for j in range(instr.args[12 * matmul_idx + 5]):
-                                    for k in range(instr.args[12 * matmul_idx + 4]):
-                                        first_factor_addr = first_base + \
-                                                            first_factor_row_length * first_factor_row_indices[i] + \
-                                                            first_factor_column_indices[k]
-                                        handle_mem_access(first_factor_addr, 's', last_mem_read_of, last_mem_write_of)
-
-                                        second_factor_addr = second_base + \
-                                                             second_factor_row_length * second_factor_row_indices[k] + \
-                                                             second_factor_column_indices[j]
-                                        handle_mem_access(second_factor_addr, 's', last_mem_read_of, last_mem_write_of)
+                                    second_factor_addr = second_base + \
+                                                         second_factor_row_length * second_factor_row_indices[k] + \
+                                                         second_factor_column_indices[j]
+                                    handle_mem_access(second_factor_addr, 's', last_mem_read_of, last_mem_write_of)
                     else:
                         # If the accessed values cannot be determined, be cautious I guess.
                         for i in last_mem_write_of.values():
diff --git a/Programs/Source/test_dot.mpc b/Programs/Source/test_dot.mpc
index 148caa0d5..92f0bad0c 100644
--- a/Programs/Source/test_dot.mpc
+++ b/Programs/Source/test_dot.mpc
@@ -158,3 +158,8 @@ M = sint.Matrix(10, 10)
 M.direct_mul(M, indices=[regint(0), regint.inc(10), regint.inc(10),
                          regint(0)])
 stop_timer(9)
+
+
+start_timer(10)
+sint.Matrix(1000, 1000) * sint.Matrix(1000, 1000)
+stop_timer(10)

From c7452697d80906011ee4aed4cd7d9b3854e491ef Mon Sep 17 00:00:00 2001
From: Vincent Ehrmanntraut <ehrmanntraut@itsec.rwth-aachen.de>
Date: Fri, 28 Jun 2024 09:44:11 +0200
Subject: [PATCH 4/4] Change MATMULSM allocator timeout from a time threshould
 to a size threshold

---
 Compiler/allocator.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/Compiler/allocator.py b/Compiler/allocator.py
index 05b6dbccd..2aaac2015 100644
--- a/Compiler/allocator.py
+++ b/Compiler/allocator.py
@@ -600,15 +600,24 @@ def keep_text_order(inst, n):
                             first_factor_row_length = instr.args[12 * matmul_idx + 10]
                             second_factor_row_length = instr.args[12 * matmul_idx + 11]
 
-                            # Add dependencies to the first factor.
-                            for i in range(instr.args[12 * matmul_idx + 3]):
-                                if (time.time() - start_time) > 10:
-                                    # Abort building the dependencies if that takes too much time.
-                                    if block.warn_about_mem and not block.parent.warned_about_mem:
-                                        print('WARNING: Order of memory instructions not preserved due to long vector, errors possible')
-                                        block.parent.warned_about_mem = True
-                                    break
+                            # Due to the potentially very large number of inputs on large matrices, adding dependencies to
+                            # all inputs may take a long time. Therefore, we only partially build the dependencies on
+                            # large matrices and output a warning.
+                            # The threshold of 2_250_000 values per matrix is equivalent to multiplying two 1500x1500
+                            # matrices. Experiments showed that multiplying two 1700x1700 matrices requires roughly 10 seconds on an i7-1370P,
+                            # so this threshold should lead to acceptable compile times even on slower processors.
+                            first_factor_total_number_of_values = instr.args[12 * matmul_idx + 3] * instr.args[12 * matmul_idx + 4]
+                            second_factor_total_number_of_values = instr.args[12 * matmul_idx + 4] * instr.args[12 * matmul_idx + 5]
+                            max_dependencies_per_matrix = 1500**2
+                            if first_factor_total_number_of_values > max_dependencies_per_matrix or second_factor_total_number_of_values > max_dependencies_per_matrix:
+                                if block.warn_about_mem and not block.parent.warned_about_mem:
+                                    print('WARNING: Order of memory instructions not preserved due to long vector, errors possible')
+                                    block.parent.warned_about_mem = True
 
+                            # Add dependencies to the first factor.
+                            # If the size of the matrix exceeds the max_dependencies_per_matrix, only a limited number
+                            # of rows will be processed.
+                            for i in range(min(instr.args[12 * matmul_idx + 3], max_dependencies_per_matrix // instr.args[12 * matmul_idx + 4] + 1)):
                                 for k in range(instr.args[12 * matmul_idx + 4]):
                                     first_factor_addr = first_base + \
                                                         first_factor_row_length * first_factor_row_indices[i] + \
@@ -616,7 +625,9 @@ def keep_text_order(inst, n):
                                     handle_mem_access(first_factor_addr, 's', last_mem_read_of, last_mem_write_of)
 
                             # Add dependencies to the second factor.
-                            for k in range(instr.args[12 * matmul_idx + 4]):
+                            # If the size of the matrix exceeds the max_dependencies_per_matrix, only a limited number
+                            # of rows will be processed.
+                            for k in range(min(instr.args[12 * matmul_idx + 4], max_dependencies_per_matrix // instr.args[12 * matmul_idx + 5] + 1)):
                                 if (time.time() - start_time) > 10:
                                     # Abort building the dependencies if that takes too much time.
                                     if block.warn_about_mem and not block.parent.warned_about_mem: