Skip to content

Commit

Permalink
Merge pull request #234 from nonhermitian/matvec
Browse files Browse the repository at this point in the history
Vectorized versions of matvec routines used in iterative solver
  • Loading branch information
nonhermitian authored Sep 30, 2024
2 parents b9952c7 + 0fa4f7b commit f06f6e2
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 58 deletions.
111 changes: 53 additions & 58 deletions mthree/matvec.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ from cython.operator cimport dereference, postincrement

from mthree.compute cimport within_distance, compute_element

cdef extern from "src/distance.h" nogil:
unsigned int hamming_terms(unsigned int num_bits,
unsigned int distance,
unsigned int num_elems)


cdef extern from "src/col_renorm.h" nogil:
void compute_col_norms(float * col_norms,
Expand All @@ -33,6 +38,30 @@ cdef extern from "src/col_renorm.h" nogil:
unsigned int distance)


cdef extern from "src/matvec.h" nogil:
void matvec(const float * x,
float * out,
float * col_norms,
const unsigned char * bitstrings,
const float * cals,
unsigned int num_bits,
unsigned int num_elems,
unsigned int distance,
int num_terms,
bool MAX_DIST)

void rmatvec(const float * x,
float * out,
float * col_norms,
const unsigned char * bitstrings,
const float * cals,
unsigned int num_bits,
unsigned int num_elems,
unsigned int distance,
int num_terms,
bool MAX_DIST)



cdef class M3MatVec():
cdef unsigned char * bitstrings
Expand All @@ -43,6 +72,7 @@ cdef class M3MatVec():
cdef public unsigned int num_bits
cdef float * cals
cdef public dict sorted_counts
cdef int num_terms

def __cinit__(self, object counts, float[::1] cals, int distance=-1):

Expand All @@ -52,12 +82,15 @@ cdef class M3MatVec():
self.num_bits = len(next(iter(counts)))
self.cals = &cals[0]
self.sorted_counts = counts_map
self.num_terms = -1

if distance == -1:
distance = self.num_bits

self.distance = distance
self.MAX_DIST = self.distance == self.num_bits
if not self.MAX_DIST:
self.num_terms = <int>hamming_terms(self.num_bits, self.distance, self.num_elems)

self.bitstrings = <unsigned char *>malloc(self.num_bits*self.num_elems*sizeof(unsigned char))
self.col_norms = <float *>malloc(self.num_elems*sizeof(float))
Expand Down Expand Up @@ -100,12 +133,16 @@ cdef class M3MatVec():
if x.shape[0] != self.num_elems:
raise Exception('Incorrect length of input vector.')
cdef float[::1] out = np.empty(self.num_elems, dtype=np.float32)
with nogil:
for row in prange(self.num_elems, schedule='static'):
omp_matvec(row, &x[0], &out[0],
self.bitstrings, self.col_norms, self.cals,
self.num_elems, self.num_bits, self.distance,
self.MAX_DIST)
matvec(&x[0],
&out[0],
self.col_norms,
self.bitstrings,
self.cals,
self.num_bits,
self.num_elems,
self.distance,
self.num_terms,
self.MAX_DIST)
return np.asarray(out, dtype=np.float32)

@cython.boundscheck(False)
Expand All @@ -114,12 +151,16 @@ cdef class M3MatVec():
if x.shape[0] != self.num_elems:
raise Exception('Incorrect length of input vector.')
cdef float[::1] out = np.empty(self.num_elems, dtype=np.float32)
with nogil:
for col in prange(self.num_elems, schedule='static'):
omp_rmatvec(col, &x[0], &out[0],
self.bitstrings, self.col_norms, self.cals,
self.num_elems, self.num_bits, self.distance,
self.MAX_DIST)
rmatvec(&x[0],
&out[0],
self.col_norms,
self.bitstrings,
self.cals,
self.num_bits,
self.num_elems,
self.distance,
self.num_terms,
self.MAX_DIST)
return np.asarray(out, dtype=np.float32)

def __dealloc__(self):
Expand All @@ -128,52 +169,6 @@ cdef class M3MatVec():
if self.col_norms is not NULL:
free(self.col_norms)

@cython.boundscheck(False)
@cython.cdivision(True)
cdef void omp_matvec(size_t row,
const float * x,
float * out,
const unsigned char * bitstrings,
const float * col_norms,
const float * cals,
unsigned int num_elems,
unsigned int num_bits,
unsigned int distance,
bool MAX_DIST) noexcept nogil:
cdef float temp_elem, row_sum = 0
cdef size_t col
for col in range(num_elems):
if MAX_DIST or within_distance(row, col, bitstrings,
num_bits, distance):
temp_elem = compute_element(row, col, bitstrings,
cals, num_bits)
temp_elem /= col_norms[col]
row_sum += temp_elem * x[col]
out[row] = row_sum

@cython.boundscheck(False)
@cython.cdivision(True)
cdef void omp_rmatvec(size_t col,
const float * x,
float * out,
const unsigned char * bitstrings,
const float * col_norms,
const float * cals,
unsigned int num_elems,
unsigned int num_bits,
unsigned int distance,
bool MAX_DIST) noexcept nogil:
cdef float temp_elem, row_sum = 0
cdef size_t row
for row in range(num_elems):
if MAX_DIST or within_distance(row, col, bitstrings,
num_bits, distance):
temp_elem = compute_element(row, col, bitstrings,
cals, num_bits)
temp_elem /= col_norms[col]
row_sum += temp_elem * x[row]
out[col] = row_sum


@cython.boundscheck(False)
@cython.cdivision(True)
Expand Down
1 change: 1 addition & 0 deletions mthree/src/col_renorm.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ that they have been altered from the originals.
#include "distance.h"
#include "elements.h"

#pragma once

void compute_col_norms(float * col_norms,
const unsigned char * __restrict bitstrings,
Expand Down
1 change: 1 addition & 0 deletions mthree/src/distance.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ that they have been altered from the originals.
#include <stddef.h>
#include <stdbool.h>

#pragma once

static inline bool within_distance(unsigned int row,
unsigned int col,
Expand Down
2 changes: 2 additions & 0 deletions mthree/src/elements.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ that they have been altered from the originals.
*/
#include <stddef.h>

#pragma once

static inline float compute_element(unsigned int row,
unsigned int col,
const unsigned char * __restrict bitstrings,
Expand Down
128 changes: 128 additions & 0 deletions mthree/src/matvec.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
This code is part of Mthree.
(C) Copyright IBM 2024.
This code is licensed under the Apache License, Version 2.0. You may
obtain a copy of this license in the LICENSE.txt file in the root directory
of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
Any modifications or derivative works of this code must retain this
copyright notice, and modified files need to carry a notice indicating
that they have been altered from the originals.
*/
#include <stddef.h>
#include <stdbool.h>
#include "distance.h"
#include "elements.h"

#pragma once

void matvec(const float * __restrict x,
float * out,
const float * __restrict col_norms,
const unsigned char * __restrict bitstrings,
const float * __restrict cals,
unsigned int num_bits,
unsigned int num_elems,
unsigned int distance,
int num_terms,
bool MAX_DIST)
/**
* @brief Action of reduced A-matrix on a vector
*
* @param x Pointer to input vector of data
* @param out Pointer to zeroed output vector
* @param col_norms Pointer to where to store col norm data
* @param bitstrings Pointer to array of bitstrings
* @param cals Pointer to array containing calibration data
* @param num_bits Number of bits in a single bit-string
* @param num_elems Number of elements (dimension) of reduced A-matrix
* @param distance Max Hamming distance
* @param num_terms Number of terms within hamming distance
* @param MAX_DIST Is the distance maximum (equal to number of bits)
*/
{

size_t row;

#pragma omp parallel for
for (row = 0; row < num_elems; ++row)
{
float temp_elem, row_sum = 0;
size_t col;
bool flag = false;
int terms = 0;
for (col = 0; col < num_elems; ++col)
{
if (flag) continue;
if (MAX_DIST || within_distance(row, col, bitstrings, num_bits, distance))
{
temp_elem = compute_element(row, col, bitstrings, cals, num_bits);
temp_elem /= col_norms[col];
row_sum += temp_elem * x[col];
terms += 1;
if (terms == num_terms)
{
flag = true;
}
}
}
out[row] = row_sum;
}
}


void rmatvec(const float * __restrict x,
float * out,
const float * __restrict col_norms,
const unsigned char * __restrict bitstrings,
const float * __restrict cals,
unsigned int num_bits,
unsigned int num_elems,
unsigned int distance,
int num_terms,
bool MAX_DIST)
/**
* @brief Action of adjoint of reduced A-matrix on a vector
*
* @param x Pointer to input vector of data
* @param out Pointer to zeroed output vector
* @param col_norms Pointer to where to store col norm data
* @param bitstrings Pointer to array of bitstrings
* @param cals Pointer to array containing calibration data
* @param num_bits Number of bits in a single bit-string
* @param num_elems Number of elements (dimension) of reduced A-matrix
* @param distance Max Hamming distance
* @param num_terms Number of terms within hamming distance
* @param MAX_DIST Is the distance maximum (equal to number of bits)
*/
{

size_t col;

#pragma omp parallel for
for (col = 0; col < num_elems; ++col)
{
float temp_elem, row_sum = 0;
size_t row;
bool flag = false;
int terms = 0;
for (row = 0; row < num_elems; ++row)
{
if (flag) continue;
if (MAX_DIST || within_distance(row, col, bitstrings, num_bits, distance))
{
temp_elem = compute_element(row, col, bitstrings, cals, num_bits);
temp_elem /= col_norms[col];
row_sum += temp_elem * x[row];
terms += 1;
if (terms == num_terms)
{
flag = true;
}
}
}
out[col] = row_sum;
}
}

0 comments on commit f06f6e2

Please sign in to comment.