From c2658c3ae8c077a70715c40f2ea5c2bfb0c22da6 Mon Sep 17 00:00:00 2001
From: S
Date: Sat, 6 Apr 2024 11:23:38 +0100
Subject: [PATCH] Fix unexpected tokens on MPS. Re-add F16 fix. (Noeda)

---
 llama.cpp | 350 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 346 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 53cf86dc02ef8..a8c6568c020a0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5734,6 +5734,340 @@ static void llm_build_kv_store(
     ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
 }
 
+static struct ggml_tensor * llama_build_mat_mul_blocked_computation(
+        /*
+         * Does (almost) the same thing as ggml_mul_mat mathematically speaking,
+         * but splits the computation into chunks.
+         *
+         * Why would you want to do this? As part of Command-R+ coding, we
+         * discovered that quite a bit of the GPU code is not prepared for
+         * matrices with more than 2**31-1 elements (~2 billion).
+         *
+         * Some context:
+         * https://github.com/ggerganov/llama.cpp/pull/6491
+         *
+         * This function has a limit (set to 2B): if any constituent part of
+         * the computation (input, output, result) would go over that limit
+         * byte-wise, it'll use the split computation. This is based on the
+         * idea that this minimizes the chance that somewhere downstream in
+         * GPU code, be it MPS or CUDA, there is something like: int x = y * z;
+         * where the values of y and z overflow the multiplication and then
+         * silently (or not so silently) do something weird. At the time of
+         * writing (2024-04-05), it seems that CUDA code outright crashes and
+         * MPS silently gives bad results.
+         *
+         * This is a band-aid workaround. The ideal state of the world is that
+         * this function does nothing but "return ggml_mul_mat(ctx, a, b)".
+         *
+         * The last argument (forced_block_size) is for debugging. You can
+         * force a certain block size to be used for the computation. If zero
+         * (the default), the block size is determined on the fly. Production
+         * code should always pass zero, and only set it to a non-zero value
+         * for debugging and testing.
+         */
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        const llama_model   & model,
+        const llm_build_cb  & cb,
+        int64_t il,
+        size_t forced_block_size)
+{
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    if (forced_block_size != 0) {
+        //fprintf(stderr, "warning: llama_build_mat_mul_blocked_computation() forced block size: %zu\n", forced_block_size);
+    }
+
+    const size_t MAX_BYTES_BEFORE_SPLIT = 2000000000;
+
+    // the actual ggml_mul_mat supports batching. But this one doesn't.
+    GGML_ASSERT(a->ne[2] == 1 && b->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == 1 && b->ne[3] == 1);
+
+    // bail out if the number of elements would be zero.
+    // nicer than getting a segfault.
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        GGML_ASSERT(a->ne[i] > 0 && "Matrix multiplication with a 0-sized matrix ('a').");
+        GGML_ASSERT(b->ne[i] > 0 && "Matrix multiplication with a 0-sized matrix ('b').");
+    }
+
+    // Use the max size of: a, b, result size
+    const size_t a_rows = a->ne[1];
+    const size_t a_cols = a->ne[0];
+
+    // b is transposed
+    const size_t b_rows = b->ne[0];
+    const size_t b_cols = b->ne[1];
+
+    const size_t c_rows = a_rows;
+    const size_t c_cols = b_cols;
+
+    // determine a block size that's as big as possible.
+    // we start with the maximum size, and if that passes,
+    // then we just use ggml_mul_mat()
+    //
+    // the block is square.
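+    //
+    // example, with illustrative numbers only: if the largest side of the
+    // inputs is 200000, the loop below rounds block_size up to 262144 (2^18);
+    // the search after it then halves that (262144 -> 131072 -> ...) until
+    // every slice of 'a' and 'b' and every result block stays under
+    // MAX_BYTES_BEFORE_SPLIT.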
+    size_t cand_block_size = a_rows;
+    if (a_cols > cand_block_size) { cand_block_size = a_cols; }
+    if (b_rows > cand_block_size) { cand_block_size = b_rows; }
+    if (b_cols > cand_block_size) { cand_block_size = b_cols; }
+    if (c_rows > cand_block_size) { cand_block_size = c_rows; }
+    if (c_cols > cand_block_size) { cand_block_size = c_cols; }
+
+    size_t block_size = 1;
+    while (block_size < cand_block_size) {
+        block_size <<= 1;
+    }
+
+    if (forced_block_size != 0) {
+        block_size = forced_block_size;
+    } else {
+        // figure out the largest block_size we can use that will never
+        // have an intermediate result bigger than
+        // MAX_BYTES_BEFORE_SPLIT
+        bool ok = true;
+        while (block_size > 0) {
+            ok = true;
+
+            // keep the byte calculations in sync with the blocked code in
+            // the computation part.
+
+            // Criteria:
+            // 1. result block size
+            {
+                const size_t i_min = 0;
+                const size_t j_min = 0;
+                size_t i_max = i_min + block_size;
+                size_t j_max = j_min + block_size;
+                if (i_max > a_rows) { i_max = a_rows; }
+                if (j_max > b_cols) { j_max = b_cols; }
+
+                const size_t bytes_size = sizeof(float) * (i_max - i_min) * (j_max - j_min);
+                if (bytes_size > MAX_BYTES_BEFORE_SPLIT) {
+                    ok = false;
+                }
+            }
+            // 2. and 3.
+            // Block sizes from 'a' and 'b'
+            {
+                const size_t i_min = 0;
+                const size_t j_min = 0;
+                const size_t k_min = 0;
+
+                size_t i_max = i_min + block_size;
+                size_t j_max = j_min + block_size;
+                size_t k_max = k_min + block_size;
+
+                if (i_max > a_rows) { i_max = a_rows; }
+                if (j_max > b_cols) { j_max = b_cols; }
+                if (k_max > a_cols) { k_max = a_cols; }
+
+                const size_t bytes_size_a = sizeof(float) * (k_max - k_min) * (i_max - i_min);
+                const size_t bytes_size_b = sizeof(float) * (k_max - k_min) * (j_max - j_min);
+
+                if (bytes_size_a > MAX_BYTES_BEFORE_SPLIT || bytes_size_b > MAX_BYTES_BEFORE_SPLIT) {
+                    ok = false;
+                }
+            }
+
+            if (!ok) {
+                block_size /= 2;
+                continue;
+            }
+            break;
+        }
+        GGML_ASSERT(block_size > 0);
+    }
+
+    //fprintf(stderr, "block_size=%zu a shape: %zu %zu b shape: %zu %zu\n", block_size, a_rows, a_cols, b_rows, b_cols);
+
+    // O(N^3) nested loop, where N is the number of blocks on one of the
+    // constituent parts.
+    size_t nb_A  = (a_rows + block_size - 1) / block_size;
+    size_t nb_B  = (b_cols + block_size - 1) / block_size;
+    size_t nb_A2 = (a_cols + block_size - 1) / block_size;
+
+    // make placeholder tensors for each block's result.
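+    // each block starts as a zero tensor (a fresh tensor scaled by 0.0f) and
+    // the i/j/k loops below accumulate the partial products into it with
+    // ggml_add, i.e. ordinary blocked matrix multiplication expressed with
+    // ggml views.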
+    // 2D: (row, col) -> offset is: (x, y) -> x * nb_B + y
+    struct ggml_tensor ** result_blocks = (struct ggml_tensor **) malloc(nb_A * nb_B * sizeof(struct ggml_tensor *));
+
+    for (size_t i = 0; i < nb_A; ++i) {
+        for (size_t j = 0; j < nb_B; ++j) {
+            const size_t i_min = i * block_size;
+            const size_t j_min = j * block_size;
+            size_t i_max = i_min + block_size;
+            size_t j_max = j_min + block_size;
+
+            if (i_max > a_rows) { i_max = a_rows; }
+            if (j_max > b_cols) { j_max = b_cols; }
+
+            struct ggml_tensor * result_block = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, i_max - i_min, j_max - j_min);
+            result_block = ggml_scale(ctx, result_block, 0.0f);
+
+            cb(result_block, "result_block-fresh", il);
+            result_blocks[i * nb_B + j] = result_block;
+        }
+    }
+
+    size_t num_blocks = 0;
+    for (size_t i = 0; i < nb_A; ++i) {
+        for (size_t j = 0; j < nb_B; ++j) {
+            for (size_t k = 0; k < nb_A2; ++k) {
+                num_blocks++;
+
+                const size_t i_min = i * block_size;
+                const size_t j_min = j * block_size;
+                const size_t k_min = k * block_size;
+
+                size_t i_max = i_min + block_size;
+                size_t j_max = j_min + block_size;
+                size_t k_max = k_min + block_size;
+                if (i_max > a_rows) { i_max = a_rows; }
+                if (j_max > b_cols) { j_max = b_cols; }
+                if (k_max > a_cols) { k_max = a_cols; }
+
+                const size_t blck_size_a = (const size_t) ggml_blck_size(a->type);
+                const size_t blck_size_b = (const size_t) ggml_blck_size(b->type);
+                const size_t type_size_a = ggml_type_size(a->type);
+                const size_t type_size_b = ggml_type_size(b->type);
+
+                GGML_ASSERT(k_min * type_size_a % blck_size_a == 0);
+                GGML_ASSERT(k_min * type_size_b % blck_size_b == 0);
+
+                // e.g. blck_size=32, type_size_a=19, k_min=4:
+                //
+                //   byte offset along the row = (k_min * type_size_a) / blck_size_a
+                //                             = (4 * 19) / 32 = 2
+
+                struct ggml_tensor * a_slice = ggml_view_2d(
+                    ctx, a,
+                    k_max - k_min,  // k:k_max size
+                    i_max - i_min,  // i:i_max size
+                    ggml_row_size(a->type, a->ne[0]),
+                    ggml_row_size(a->type, a->ne[0]) * i_min + k_min * type_size_a / blck_size_a);
+
+                cb(a_slice, "a_slice", il);
+
+                struct ggml_tensor * b_slice = ggml_view_2d(
+                    ctx, b,
+                    k_max - k_min,  // k:k_max size
+                    j_max - j_min,  // j:j_max size
+                    ggml_row_size(b->type, b->ne[0]),
+                    ggml_row_size(b->type, b->ne[0]) * j_min + k_min * type_size_b / blck_size_b);
+
+                cb(b_slice, "b_slice", il);
+
+                struct ggml_tensor * result_slice = result_blocks[i * nb_B + j];
+
+                struct ggml_tensor * mm_result = ggml_mul_mat(ctx, a_slice, b_slice);
+                cb(mm_result, "mm_result", il);
+
+                result_blocks[i * nb_B + j] = ggml_add(ctx, result_slice, mm_result);
+                cb(result_blocks[i * nb_B + j], "result_slice", il);
+            }
+        }
+    }
+
+    // concatenate the results into one chonky tensor.
+    // ggml_concat goes mad if the first two dimensions are not the same.
+    //
+    // We use this strategy: find the largest power of two that divides the
+    // size of all the tensors. Power of two to make it friendly to GPU
+    // code; (TODO: GCD might be better? but not sure it won't break code).
+    //
+    // Flatten all the tensors to (X, 1, N, 1).
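+    //
+    // illustrative example only: blocks of 96 and 160 elements would end up
+    // with split_size = 32 (the largest power of two dividing both) and be
+    // viewed as (32, 1, 3, 1) and (32, 1, 5, 1) before concatenation.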
+    size_t split_size = 1;
+    while (1) {
+        size_t candidate_split_size = split_size << 1;
+        bool bad = false;
+
+        for (size_t i = 0; i < nb_A * nb_B; ++i) {
+            size_t rows = result_blocks[i]->ne[0];
+            size_t cols = result_blocks[i]->ne[1];
+
+            if (candidate_split_size > rows * cols) {
+                bad = true;
+                break;
+            }
+
+            if ((rows * cols) % candidate_split_size != 0) {
+                bad = true;
+                break;
+            }
+        }
+
+        if (bad) {
+            break;
+        }
+
+        split_size = candidate_split_size;
+    }
+
+    struct ggml_tensor * result_final = nullptr;
+    const ggml_type wanted_final_type = a->type;
+
+    // TODO: looks like concat also wants f32, so everything is cast to
+    // f32 here. A datatype-agnostic concat would be nice; or the ability to
+    // do the tensor equivalent of an unsafe type cast.
+    //
+    // The Command-R+ tensor this code was written for was 6GB. So this is
+    // going to handle 12GB I guess. Oof.
+    //
+    // I believe you could be smarter and combine hierarchically instead of
+    // one by one. I.e. we are doing a concatenation like this:
+    //   for x in range(100):
+    //       accum = accum + [x]   (copies accum every time? maybe. didn't read concat code)
+    //
+    // You could instead divide and conquer to make it a bit smarter.
+    for (size_t i = 0; i < nb_A; ++i) {
+        for (size_t j = 0; j < nb_B; ++j) {
+            struct ggml_tensor * src_block = result_blocks[i * nb_B + j];
+
+            const size_t rows = src_block->ne[0];
+            const size_t cols = src_block->ne[1];
+            GGML_ASSERT(rows * cols % split_size == 0);
+
+            const size_t nflattened_rows = split_size;
+            const size_t n3 = (rows * cols) / split_size;
+
+            src_block = ggml_view_3d(ctx, src_block,
+                nflattened_rows,
+                1,
+                n3,
+                nflattened_rows * ggml_element_size(src_block),
+                nflattened_rows * ggml_element_size(src_block),
+                0);
+
+            if (result_final == nullptr) {
+                if (src_block->type != GGML_TYPE_F32) {
+                    result_final = ggml_cast(ctx, src_block, GGML_TYPE_F32);
+                    cb(result_final, "result-upcast", il);
+                } else {
+                    result_final = src_block;
+                }
+                continue;
+            }
+
+            if (src_block->type != GGML_TYPE_F32) {
+                src_block = ggml_cast(ctx, src_block, GGML_TYPE_F32);
+            }
+            result_final = ggml_concat(ctx, result_final, src_block);
+            cb(result_final, "result_final-accumulator", il);
+        }
+    }
+
+    result_final = ggml_reshape_2d(ctx, result_final, c_rows, c_cols);
+    cb(result_final, "result_final", il);
+
+    free(result_blocks);
+
+    return result_final;
+}
+
 static struct ggml_tensor * llm_build_norm(
          struct ggml_context * ctx,
          struct ggml_tensor * cur,
@@ -6479,7 +6813,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cur = llama_build_mat_mul_blocked_computation(ctx0, model.output, cur, model, cb, -1, 0);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -9462,7 +9796,15 @@ struct llm_build_context {
 
                 if (model.layers[il].attn_q_norm) {
-
+                    struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm;
+                    struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm;
+
+                    // CPU did not like F16, so cast to F32
+                    attn_q_norm = ggml_cast(ctx0, attn_q_norm, GGML_TYPE_F32);
+                    cb(attn_q_norm, "attn_q_norm_cast_F32", il);
+                    attn_k_norm = ggml_cast(ctx0, attn_k_norm, GGML_TYPE_F32);
+                    cb(attn_k_norm, "attn_k_norm_cast_F32", il);
+
                     Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
                                 ggml_element_size(Qcur) * n_embd_head,
                                 ggml_element_size(Qcur) * n_embd_head * n_head,
@@ -9475,13 +9817,13 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
 
                 Qcur = llm_build_norm(ctx0, Qcur, hparams,
-                            model.layers[il].attn_q_norm,
+                            attn_q_norm,
NULL, LLM_NORM, cb, il); cb(Qcur, "Qcur", il); Kcur = llm_build_norm(ctx0, Kcur, hparams, - model.layers[il].attn_k_norm, + attn_k_norm, NULL, LLM_NORM, cb, il); cb(Kcur, "Kcur", il);