basic matrix multiply example for C++ api? #17

Open
troyliu0105 opened this issue Sep 12, 2021 · 9 comments

Comments

@troyliu0105

I couldn't find any C++ examples (including a basic matrix multiply) here. Any related code snippet would be very helpful... 😢

@dumpinfo

template<int M, bool NeedEncodeX>
void _run_bolt_matmul(const RowMatrix<float>& X, const RowMatrix<float>& Q,
                      const ColMatrix<float>& centroids, ColMatrix<uint8_t>& codes,
                      const RowVector<float>& offsets, float scaleby,
                      ColMatrix<uint8_t>& lut_out, RowMatrix<uint16_t>& out)
{
    auto ncols = X.cols();
    auto nqueries = Q.rows();
    auto nblocks = X.rows() / 32;
    REQUIRE(ncols == Q.cols());
    REQUIRE((nblocks * 32) == X.rows());

    // optionally encode the rows of X as 4-bit codes, one per codebook
    if (NeedEncodeX) {
        bolt_encode<M>(X.data(), X.rows(), (int)X.cols(), centroids.data(),
                       codes.data());
    }

    // build a lookup table per query, then scan the codes with it
    for (int i = 0; i < nqueries; i++) {
        const float* q = Q.row(i).data();
        bolt_lut<M>(q, (int)ncols, centroids.data(), offsets.data(),
                    scaleby, lut_out.data());
        bolt_scan<M, true>(codes.data(), lut_out.data(), out.row(i).data(),
                           nblocks);
    }
}
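For orientation, here is a minimal, hypothetical Catch test case showing how the helper above gets wired up. The test name, tags, sizes, and the bolt<2> choice are made up; it assumes the Eigen typedefs (RowMatrix, ColMatrix, RowVector) and the bolt_* kernels from the repo's headers are in scope. With random centroids the outputs are meaningless -- this only demonstrates the call sequence.

// Hypothetical usage sketch, not code from the repo.
TEST_CASE("bolt matmul wiring example", "[amm][bolt]") {
    constexpr int M = 2;                              // bytes per code -> 2*M codebooks
    const int nrows = 1024, ncols = 64, nqueries = 8; // nrows must be a multiple of 32

    RowMatrix<float> X(nrows, ncols);        X.setRandom();
    RowMatrix<float> Q(nqueries, ncols);     Q.setRandom();
    ColMatrix<float> centroids(16, ncols);   centroids.setRandom();  // would be learned
    ColMatrix<uint8_t> codes(nrows, 2 * M);  codes.setZero();
    RowVector<float> offsets(ncols);         offsets.setZero();
    ColMatrix<uint8_t> luts(16, 2 * M);
    RowMatrix<uint16_t> out(nqueries, nrows);

    // encode X, build one LUT per query, and scan; out(i, j) then approximates
    // a scaled / offset version of the inner product of Q.row(i) with X.row(j)
    _run_bolt_matmul<M, true>(X, Q, centroids, codes, offsets,
                              /*scaleby=*/1.f, luts, out);
    REQUIRE(out.rows() == nqueries);
}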
//##########################################################################
template<int M>
void _profile_bolt_matmul(int nrows, int ncols, int nqueries) {
    static constexpr int ncodebooks = 2 * M;
    static constexpr int ncentroids = 16;  // bolt uses 4-bit codes

    // create random data
    RowMatrix<float> X(nrows, ncols);
    X.setRandom();

    // create random queries
    RowMatrix<float> Q(nqueries, ncols);
    Q.setRandom();

    // create random centroids
    ColMatrix<float> centroids(ncentroids, ncols);
    centroids.setRandom();

    // create random codes in [0, 15]
    ColMatrix<uint8_t> codes(nrows, ncodebooks);
    codes.setRandom();
    codes = codes.array() / 16;

    // create random / arbitrary offsets and scale factor for luts
    RowVector<float> offsets(ncols);
    offsets.setRandom();
    float scaleby = 3; // arbitrary number

    // storage for luts, product
    ColMatrix<uint8_t> luts(ncentroids, ncodebooks);
    RowMatrix<uint16_t> out(nqueries, nrows);

    // time it
    std::string msg = string_with_format("bolt<%d> encode=%d matmul %d",
                                         M, false, nqueries);
    REPEATED_PROFILE_DIST_COMPUTATION(kNreps, msg, kNtrials,
        out.data(), nrows * nqueries,
        (_run_bolt_matmul<M, false>(X, Q, centroids, codes, offsets, scaleby,
                                    luts, out)));

    std::string msg2 = string_with_format("bolt<%d> encode=%d matmul %d",
                                          M, true, nqueries);
    REPEATED_PROFILE_DIST_COMPUTATION(kNreps, msg2, kNtrials,
        out.data(), nrows * nqueries,
        (_run_bolt_matmul<M, true>(X, Q, centroids, codes, offsets, scaleby,
                                   luts, out)));
}
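And a sketch of a call site: the repo's profiling code is driven from Catch test cases, so an invocation could look roughly like this. The test name, tags, and problem sizes below are illustrative, and kNreps / kNtrials plus the REPEATED_PROFILE_DIST_COMPUTATION macro are assumed to come from the repo's test utilities.

// Hypothetical call site for the profiler above.
TEST_CASE("profile bolt matmul example", "[amm][bolt][profile]") {
    static constexpr int nrows = 1 << 14;   // multiple of 32, as required above
    static constexpr int ncols = 128;
    static constexpr int nqueries = 16;
    _profile_bolt_matmul<2>(nrows, ncols, nqueries);   //  4 codebooks
    _profile_bolt_matmul<8>(nrows, ncols, nqueries);   // 16 codebooks
}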
//################################################################

template<class InputT>
struct mithral_amm_task {
    using traits = mithral_input_type_traits<InputT>;
    using scale_t = typename traits::encoding_scales_type;
    using offset_t = typename traits::encoding_offsets_type;
    using output_t = typename traits::output_type;
    static constexpr int scan_block_nrows = 32;
    static constexpr int ncentroids = 16;
    static constexpr int nsplits_per_codebook = 4;
    static constexpr int max_splitvals = 1 << 4;

    mithral_amm_task(int N, int D, int M, int ncodebooks,
                     float lut_work_const):
        N_padded(N % scan_block_nrows == 0 ? N :
            N + (scan_block_nrows - (N % scan_block_nrows))),
        centroids(ncentroids * ncodebooks, D),
        nsplits(ncodebooks * nsplits_per_codebook),
        splitdims(nsplits),
        splitvals(max_splitvals, nsplits),
        encode_scales(nsplits),
        encode_offsets(nsplits),
        nnz_per_centroid(lut_work_const > 0 ?
            lut_work_const * D / ncodebooks : D),
        idxs(ncodebooks, nnz_per_centroid),
        amm(N_padded, D, M, ncodebooks, centroids.data(),
            splitdims.data(), splitvals.data(),
            encode_scales.data(), encode_offsets.data(),
            idxs.data(), nnz_per_centroid),
        X(N_padded, D),
        Q(D, M)
    {
        centroids.setRandom();
        splitdims.setRandom();
        for (int i = 0; i < splitdims.size(); i++) {
            splitdims(i) = splitdims(i) % D;
        }
        splitvals.setRandom();
        encode_scales.setRandom();
        encode_offsets.setRandom();

        // randomly initialize idxs, ensuring all are unique and < D
        idxs.setRandom();
        int all_idxs[D];
        for (int i = 0; i < D; i++) {
            all_idxs[i] = i;
        }
        std::random_device rd;
        std::mt19937 g(rd());  // why can't shuffle just create its own...
        for (int c = 0; c < ncodebooks; c++) {  // random sequential idxs
            std::shuffle(all_idxs, all_idxs + D, g);
            std::sort(all_idxs, all_idxs + nnz_per_centroid);
            for (int j = 0; j < nnz_per_centroid; j++) {
                idxs(c, j) = all_idxs[j];
            }
        }

        X.setRandom();
        Q.setRandom();
    }

    void encode() { amm.encode(X.data()); }
    void lut() { amm.lut(Q.data()); }
    void scan() { amm.scan(); }

    void run_matmul(bool create_lut=true) {
        encode();
        if (create_lut) {
            lut();
        }
        scan();
    }

    const ColMatrix<output_t>& output() const { return amm.out_mat; }

    // stuff we pass into the amm object (would be learned during training)
    int N_padded;
    ColMatrix<float> centroids;
    int nsplits;
    RowVector<uint32_t> splitdims;
    ColMatrix<int8_t> splitvals;
    RowVector<scale_t> encode_scales;
    RowVector<offset_t> encode_offsets;
    int nnz_per_centroid;
    RowMatrix<int> idxs;

    // amm object
    mithral_amm<InputT> amm;

    // random data
    ColMatrix<InputT> X;
    ColMatrix<float> Q;
};
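For completeness, a minimal, hypothetical end-to-end use of mithral_amm_task might look like the following (the test name, tags, sizes, and float input type are arbitrary). With the randomly initialized splits and centroids this only exercises the encode -> lut -> scan pipeline; meaningful output needs parameters learned by the Python code, as discussed further down in this thread.

// Hypothetical usage sketch, not code from the repo.
TEST_CASE("mithral amm task example", "[amm][mithral]") {
    const int N = 4096, D = 128, M = 16;   // approximates the product (N x D) * (D x M)
    const int ncodebooks = 16;
    const float lut_work_const = 2.0f;     // <= 0 would mean "use all D dims" (see ctor)

    mithral_amm_task<float> task(N, D, M, ncodebooks, lut_work_const);
    task.run_matmul(/*create_lut=*/true);  // encode(), lut(), scan()

    const auto& out = task.output();       // ColMatrix<output_t> held by the amm object
    REQUIRE(out.size() > 0);
}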
//#################################################################

@troyliu0105
Author

@dumpinfo Thx for your reply. I'm going to test it. 😆

@a3413209

Hello, how do I use this test case? I have compiled and run the source with bazel run :main. What should I do afterwards? Could you tell me more about it?

@dblalock
Owner

To run any of the MADDNESS code, use cpp/test/main.cpp. I used Catch tests for everything because they give you a nice CLI and make debugging easy.

You can run it with no arguments, but you probably want to pass some tags to select which tests to run, since the full suite takes a while. Something like [amm] ~[old] runs just the MADDNESS tests and excludes the legacy / debugging tests.
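(For instance, with the bazel setup mentioned above, that would be something along the lines of: bazel run :main -- "[amm] ~[old]". The exact target and binary name depend on your build files, so treat this as a sketch rather than a verified command.)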

@fjrdev

fjrdev commented Apr 20, 2022

Hi @dumpinfo, @troyliu0105, it seems like you have worked with mithral in its C++ implementation. Could you find the files where the parameter optimization takes place? Running run_matmul() in mithral_amm_task() obviously does not work, since the parameters necessary for the encoding and LUT creation are set randomly. Or does one have to provide the learning phase?

@dblalock
Owner

@fjrdev: there is no learning available in the C++ API, only in the Python implementation. So you'd have to generate the split thresholds, split values, and prototypes in Python and pass them to C++, which this repo doesn't support doing yet (and I likely won't add it in the foreseeable future, because I wrote this code 2.5 years ago and am super busy with work + family these days).

@dumpinfo

The learning is implemented in "python/clusterize.py".

@dumpinfo

(image attachment)

@mpetri

mpetri commented Jun 21, 2022

@dumpinfo have you managed to port the Python learning code to C++, or at least been able to extract the learnt parameters into the C++ codebase?
