Merge pull request #234 from nonhermitian/matvec

Vectorized versions of matvec routines used in iterative solver
Qiskit · Sep 30, 2024 · f06f6e2 · f06f6e2
2 parents b9952c7 + 0fa4f7b
commit f06f6e2
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 58 deletions.
diff --git a/mthree/matvec.pyx b/mthree/matvec.pyx
@@ -23,6 +23,11 @@ from cython.operator cimport dereference, postincrement
 
 from mthree.compute cimport within_distance, compute_element
 
+cdef extern from "src/distance.h" nogil:
+    unsigned int hamming_terms(unsigned int num_bits,
+                               unsigned int distance,
+                               unsigned int num_elems)
+
 
 cdef extern from "src/col_renorm.h" nogil:
     void compute_col_norms(float * col_norms,
@@ -33,6 +38,30 @@ cdef extern from "src/col_renorm.h" nogil:
                            unsigned int distance)
 
 
+cdef extern from "src/matvec.h" nogil:
+    void matvec(const float * x,
+                float * out,
+                float * col_norms,
+                const unsigned char * bitstrings,
+                const float * cals,
+                unsigned int num_bits,
+                unsigned int num_elems,
+                unsigned int distance,
+                int num_terms,
+                bool MAX_DIST)
+
+    void rmatvec(const float * x,
+                 float * out,
+                 float * col_norms,
+                 const unsigned char * bitstrings,
+                 const float * cals,
+                 unsigned int num_bits,
+                 unsigned int num_elems,
+                 unsigned int distance,
+                 int num_terms,
+                 bool MAX_DIST)
+
+
 
 cdef class M3MatVec():
     cdef unsigned char * bitstrings
@@ -43,6 +72,7 @@ cdef class M3MatVec():
     cdef public unsigned int num_bits
     cdef float * cals
     cdef public dict sorted_counts
+    cdef int num_terms
 
     def __cinit__(self, object counts, float[::1] cals, int distance=-1):
 
@@ -52,12 +82,15 @@ cdef class M3MatVec():
         self.num_bits = len(next(iter(counts)))
         self.cals = &cals[0]
         self.sorted_counts = counts_map
+        self.num_terms = -1
 
         if distance == -1:
             distance = self.num_bits
 
         self.distance = distance
         self.MAX_DIST = self.distance == self.num_bits
+        if not self.MAX_DIST:
+            self.num_terms = <int>hamming_terms(self.num_bits, self.distance, self.num_elems)
 
         self.bitstrings = <unsigned char *>malloc(self.num_bits*self.num_elems*sizeof(unsigned char))
         self.col_norms = <float *>malloc(self.num_elems*sizeof(float))
@@ -100,12 +133,16 @@ cdef class M3MatVec():
         if x.shape[0] != self.num_elems:
             raise Exception('Incorrect length of input vector.')
         cdef float[::1] out = np.empty(self.num_elems, dtype=np.float32)
-        with nogil:
-            for row in prange(self.num_elems, schedule='static'):
-                omp_matvec(row, &x[0], &out[0],
-                           self.bitstrings, self.col_norms, self.cals,
-                           self.num_elems, self.num_bits, self.distance,
-                           self.MAX_DIST)
+        matvec(&x[0],
+               &out[0],
+               self.col_norms,
+               self.bitstrings,
+               self.cals,
+               self.num_bits,
+               self.num_elems,
+               self.distance,
+               self.num_terms,
+               self.MAX_DIST)
         return np.asarray(out, dtype=np.float32)
 
     @cython.boundscheck(False)
@@ -114,12 +151,16 @@ cdef class M3MatVec():
         if x.shape[0] != self.num_elems:
             raise Exception('Incorrect length of input vector.')
         cdef float[::1] out = np.empty(self.num_elems, dtype=np.float32)
-        with nogil:
-            for col in prange(self.num_elems, schedule='static'):
-                omp_rmatvec(col, &x[0], &out[0],
-                            self.bitstrings, self.col_norms, self.cals,
-                            self.num_elems, self.num_bits, self.distance,
-                            self.MAX_DIST)
+        rmatvec(&x[0],
+                &out[0],
+                self.col_norms,
+                self.bitstrings,
+                self.cals,
+                self.num_bits,
+                self.num_elems,
+                self.distance,
+                self.num_terms,
+                self.MAX_DIST)
         return np.asarray(out, dtype=np.float32)
 
     def __dealloc__(self):
@@ -128,52 +169,6 @@ cdef class M3MatVec():
         if self.col_norms is not NULL:
             free(self.col_norms)
 
-@cython.boundscheck(False)
-@cython.cdivision(True)
-cdef void omp_matvec(size_t row,
-                     const float * x,
-                     float * out,
-                     const unsigned char * bitstrings,
-                     const float * col_norms,
-                     const float * cals,
-                     unsigned int num_elems,
-                     unsigned int num_bits,
-                     unsigned int distance,
-                     bool MAX_DIST) noexcept nogil:
-    cdef float temp_elem, row_sum = 0
-    cdef size_t col
-    for col in range(num_elems):
-        if MAX_DIST or within_distance(row, col, bitstrings,
-                                       num_bits, distance):
-            temp_elem = compute_element(row, col, bitstrings,
-                                        cals, num_bits)
-            temp_elem /= col_norms[col]
-            row_sum += temp_elem * x[col]
-    out[row] = row_sum
-
-@cython.boundscheck(False)
-@cython.cdivision(True)
-cdef void omp_rmatvec(size_t col,
-                      const float * x,
-                      float * out,
-                      const unsigned char * bitstrings,
-                      const float * col_norms,
-                      const float * cals,
-                      unsigned int num_elems,
-                      unsigned int num_bits,
-                      unsigned int distance,
-                      bool MAX_DIST) noexcept nogil:
-    cdef float temp_elem, row_sum = 0
-    cdef size_t row
-    for row in range(num_elems):
-        if MAX_DIST or within_distance(row, col, bitstrings,
-                                       num_bits, distance):
-            temp_elem = compute_element(row, col, bitstrings,
-                                        cals, num_bits)
-            temp_elem /= col_norms[col]
-            row_sum += temp_elem * x[row]
-    out[col] = row_sum
-
 
 @cython.boundscheck(False)
 @cython.cdivision(True)

diff --git a/mthree/src/col_renorm.h b/mthree/src/col_renorm.h
@@ -16,6 +16,7 @@ that they have been altered from the originals.
 #include "distance.h"
 #include "elements.h"
 
+#pragma once
 
 void compute_col_norms(float * col_norms,
                        const unsigned char * __restrict bitstrings,

diff --git a/mthree/src/distance.h b/mthree/src/distance.h
@@ -14,6 +14,7 @@ that they have been altered from the originals.
 #include <stddef.h>
 #include <stdbool.h>
 
+#pragma once
 
 static inline bool within_distance(unsigned int row,
                                    unsigned int col,

diff --git a/mthree/src/elements.h b/mthree/src/elements.h
@@ -13,6 +13,8 @@ that they have been altered from the originals.
 */
 #include <stddef.h>
 
+#pragma once
+
 static inline float compute_element(unsigned int row,
                                    unsigned int col,
                                    const unsigned char * __restrict bitstrings,

diff --git a/mthree/src/matvec.h b/mthree/src/matvec.h
@@ -0,0 +1,128 @@
+/*
+This code is part of Mthree.
+
+(C) Copyright IBM 2024.
+
+This code is licensed under the Apache License, Version 2.0. You may
+obtain a copy of this license in the LICENSE.txt file in the root directory
+of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+
+Any modifications or derivative works of this code must retain this
+copyright notice, and modified files need to carry a notice indicating
+that they have been altered from the originals.
+*/
+#include <stddef.h>
+#include <stdbool.h>
+#include "distance.h"
+#include "elements.h"
+
+#pragma once
+
+void matvec(const float * __restrict x,
+            float * out,
+            const float * __restrict col_norms,
+            const unsigned char * __restrict bitstrings,
+            const float * __restrict cals,
+            unsigned int num_bits,
+            unsigned int num_elems,
+            unsigned int distance,
+            int num_terms,
+            bool MAX_DIST)
+    /**
+   * @brief Action of reduced A-matrix on a vector
+   *
+   * @param x Pointer to input vector of data
+   * @param out Pointer to zeroed output vector
+   * @param col_norms Pointer to where to store col norm data
+   * @param bitstrings Pointer to array of bitstrings
+   * @param cals Pointer to array containing calibration data
+   * @param num_bits Number of bits in a single bit-string
+   * @param num_elems Number of elements (dimension) of reduced A-matrix
+   * @param distance Max Hamming distance
+   * @param num_terms Number of terms within hamming distance
+   * @param MAX_DIST Is the distance maximum (equal to number of bits)
+   */
+    {
+
+      size_t row;
+
+      #pragma omp parallel for
+      for (row = 0; row < num_elems; ++row)
+      {
+        float temp_elem, row_sum = 0;
+        size_t col;
+        bool flag = false;
+        int terms = 0;
+        for (col = 0; col < num_elems; ++col)
+        {
+          if (flag) continue;
+          if (MAX_DIST || within_distance(row, col, bitstrings, num_bits, distance))
+          {
+            temp_elem = compute_element(row, col, bitstrings, cals, num_bits);
+            temp_elem /= col_norms[col];
+            row_sum += temp_elem * x[col];
+            terms += 1;
+            if (terms == num_terms)
+            {
+              flag = true;
+            }
+          }
+        }
+        out[row] = row_sum;
+      }
+    }
+
+
+void rmatvec(const float * __restrict x,
+             float * out,
+             const float * __restrict col_norms,
+             const unsigned char * __restrict bitstrings,
+             const float * __restrict cals,
+             unsigned int num_bits,
+             unsigned int num_elems,
+             unsigned int distance,
+             int num_terms,
+             bool MAX_DIST)
+    /**
+   * @brief Action of adjoint of reduced A-matrix on a vector
+   *
+   * @param x Pointer to input vector of data
+   * @param out Pointer to zeroed output vector
+   * @param col_norms Pointer to where to store col norm data
+   * @param bitstrings Pointer to array of bitstrings
+   * @param cals Pointer to array containing calibration data
+   * @param num_bits Number of bits in a single bit-string
+   * @param num_elems Number of elements (dimension) of reduced A-matrix
+   * @param distance Max Hamming distance
+   * @param num_terms Number of terms within hamming distance
+   * @param MAX_DIST Is the distance maximum (equal to number of bits)
+   */
+    {
+
+      size_t col;
+
+      #pragma omp parallel for
+      for (col = 0; col < num_elems; ++col)
+      {
+        float temp_elem, row_sum = 0;
+        size_t row;
+        bool flag = false;
+        int terms = 0;
+        for (row = 0; row < num_elems; ++row)
+        {
+          if (flag) continue;
+          if (MAX_DIST || within_distance(row, col, bitstrings, num_bits, distance))
+          {
+            temp_elem = compute_element(row, col, bitstrings, cals, num_bits);
+            temp_elem /= col_norms[col];
+            row_sum += temp_elem * x[row];
+            terms += 1;
+            if (terms == num_terms)
+            {
+              flag = true;
+            }
+          }
+        }
+        out[col] = row_sum;
+      }
+    }