From 944184b2f8ecc356bc36405e3f3f6fdded411c1f Mon Sep 17 00:00:00 2001
From: slaren
Date: Mon, 22 Jan 2024 22:44:46 +0100
Subject: [PATCH] refactor multi buf

---
 ggml-alloc.c        | 183 +++++++++++++++++++-------------------------
 ggml-backend-impl.h |  15 +---
 ggml-backend.c      |  46 +++++++----
 ggml-cuda.cu        |   6 +-
 ggml-metal.m        |   2 +-
 ggml-opencl.cpp     |   4 +-
 ggml-vulkan.cpp     |   2 +-
 7 files changed, 118 insertions(+), 140 deletions(-)

diff --git a/ggml-alloc.c b/ggml-alloc.c
index faca771934799..c833ea9d1b8ab 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -776,136 +776,113 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }
 
 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-    size_t max_size = ggml_backend_buft_get_max_size(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    // single buffer allocation
-    if (nbytes <= max_size) {
-        ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
-        if (buffer == NULL) {
-            // failed to allocate buffer
-#ifndef NDEBUG
-            fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
-#endif
-            return NULL;
-        }
-
-        ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
-
-        for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->data == NULL) {
-                if (t->view_src == NULL) {
-                    ggml_tallocr_alloc(tallocr, t);
-                } else {
-                    ggml_backend_view_init(buffer, t);
-                }
-            } else {
-                if (t->view_src != NULL) {
-                    // view of a pre-allocated tensor
-                    ggml_backend_view_init(buffer, t);
-                }
-            }
-        }
-
-        ggml_tallocr_free(tallocr);
-
-        return buffer;
-    }
-
-    // multi-buffer
-    size_t n_allocs = (nbytes - 1 + max_size) / max_size;
-    size_t * nbytes_per_alloc = (size_t *) malloc(n_allocs * sizeof(size_t));
-    memset(nbytes_per_alloc, 0, n_allocs * sizeof(size_t));
-
-    // Calculate nbytes per alloc
-    size_t alloc_idx = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-            if (nbytes_per_alloc[alloc_idx] + tensor_size > max_size) {
-                // Move to next allocation
-                alloc_idx += 1;
-            }
-            nbytes_per_alloc[alloc_idx] += tensor_size;
-        }
-    }
-
-    ggml_backend_buffer_t multi_buffer = ggml_backend_multi_buffer_alloc_buffer(n_allocs, buft, nbytes);
-    ggml_backend_multi_buffer_context_t multi_ctx = (ggml_backend_multi_buffer_context_t) multi_buffer->context;
-
-    size_t bytes_counter = 0;
-    struct ggml_tensor * current_tensor = ggml_get_first_tensor(ctx);
-
-    for (alloc_idx = 0; alloc_idx < n_allocs; alloc_idx++) {
-        ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes_per_alloc[alloc_idx]);
-        if (buffer == NULL) {
-            // failed to allocate buffer
-#ifndef NDEBUG
-            fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
-#endif
-
-            // free previously allocated buffers
-            for (size_t dealloc_idx = 0; dealloc_idx < alloc_idx; dealloc_idx++) {
-                ggml_backend_buffer_free(multi_ctx->buffers[dealloc_idx]);
-            }
-
-            free(nbytes_per_alloc);
-
-            return NULL;
-        }
-
-        multi_ctx->buffers[alloc_idx] = buffer;
-
-        ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
-
-        for (; current_tensor != NULL; current_tensor = ggml_get_next_tensor(ctx, current_tensor)) {
-            size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, current_tensor), alignment);
-
-            if (bytes_counter + tensor_size > max_size) {
-                // tensor uses next buffer
-                bytes_counter = 0;
-                break;
-            }
-
-            bytes_counter += tensor_size;
-            if (current_tensor->data == NULL) {
-                if (current_tensor->view_src == NULL) {
-                    ggml_tallocr_alloc(tallocr, current_tensor);
-                } else {
-                    ggml_backend_view_init(buffer, current_tensor);
-                }
-            } else {
-                if (current_tensor->view_src != NULL) {
-                    // view of a pre-allocated tensor
-                    ggml_backend_view_init(buffer, current_tensor);
-                }
-            }
-        }
-
-        ggml_tallocr_free(tallocr);
-    }
-
-    free(nbytes_per_alloc);
-
-    return multi_buffer;
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
+    if (buffer == NULL) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+#endif
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free((*buffers)[i]);
+        }
+        free(*buffers);
+        return false;
+    }
+
+    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(tallocr, t);
+            } else {
+                ggml_backend_view_init(buffer, t);
+            }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    ggml_tallocr_free(tallocr);
+
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
+    return buffer;
 }
 
 ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
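The rewritten ggml_backend_alloc_ctx_tensors_from_buft() above greedily partitions the context's tensors into contiguous ranges: each unallocated tensor is padded to the buffer type's alignment, and a new range is cut whenever the next tensor would push the current range past max_size; alloc_tensor_range() then allocates one backend buffer per range. With the default max_size of SIZE_MAX the cut never happens, so backends that report no limit keep the old single-buffer behavior. A minimal standalone sketch of the splitting policy follows; it is illustrative only (the tensor sizes, alignment, and max_size are made up, and none of this code is part of the patch):

    /* sketch of the greedy range-splitting policy, not part of the patch */
    #include <stdio.h>
    #include <stddef.h>

    /* same rounding rule as GGML_PAD(x, n) */
    static size_t pad(size_t x, size_t n) {
        return ((x + n - 1) / n) * n;
    }

    int main(void) {
        const size_t tensor_sizes[] = { 100, 200, 300, 400, 500 }; /* hypothetical */
        const size_t alignment = 32;  /* stands in for ggml_backend_buft_get_alignment() */
        const size_t max_size  = 512; /* stands in for ggml_backend_buft_get_max_size() */

        size_t cur_buf_size = 0;
        size_t n_buffers = 0;
        for (size_t i = 0; i < sizeof(tensor_sizes)/sizeof(tensor_sizes[0]); i++) {
            size_t this_size = pad(tensor_sizes[i], alignment);
            if (cur_buf_size + this_size > max_size) {
                /* cut here, where the real code calls alloc_tensor_range() */
                printf("buffer %zu: %zu bytes\n", n_buffers++, cur_buf_size);
                cur_buf_size = 0;
            }
            cur_buf_size += this_size;
        }
        if (cur_buf_size > 0) { /* remaining tensors */
            printf("buffer %zu: %zu bytes\n", n_buffers++, cur_buf_size);
        }
        printf("%zu buffers total\n", n_buffers);
        return 0;
    }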
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index f7919fe2776a9..b53b8877b9aee 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -64,19 +64,8 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-    // multi-buffer
-    struct ggml_backend_multi_buffer_context {
-        ggml_backend_buffer_t * buffers;
-        size_t n_buffers;
-    };
-
-    typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
-
-    GGML_CALL const char* ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer);
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(size_t n_buffers, ggml_backend_buffer_type_t buft, size_t nbytes);
-    GGML_CALL void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value);
-    struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void);
+    // create a buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
 
     //
     // Backend
diff --git a/ggml-backend.c b/ggml-backend.c
index af989dec85ddc..76b7e053931bf 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -28,11 +28,11 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
 }
 
 size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
-    // get_max_size is optional, defaults to UINT64_MAX
+    // get_max_size is optional, defaults to SIZE_MAX
     if (buft->iface.get_max_size) {
        return buft->iface.get_max_size(buft);
    }
-    return UINT64_MAX;
+    return SIZE_MAX;
 }
 
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
@@ -564,7 +564,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -620,7 +620,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
             /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -775,24 +775,22 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
     GGML_UNUSED(user_data);
 }
 
 // multi-buffer buffer
 
-GGML_CALL const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
-
-    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(size_t n_buffers, ggml_backend_buffer_type_t buft, size_t nbytes) {
-    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
-    ctx->n_buffers = n_buffers;
-    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
-
-    return ggml_backend_buffer_init(buft, ggml_backend_multi_buffer_context_interface(), ctx, nbytes);
-}
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
 
-GGML_CALL void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_free(ctx->buffers[i]);
@@ -802,14 +800,14 @@ GGML_CALL void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffe
     free(ctx);
 }
 
-GGML_CALL void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
     for (size_t i = 0; i < ctx->n_buffers; i++) {
         ggml_backend_buffer_clear(ctx->buffers[i], value);
     }
 }
 
-struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
     static struct ggml_backend_buffer_i multi_backend_buffer_i = {
         /* .get_name        = */ ggml_backend_multi_buffer_get_name,
         /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
@@ -825,6 +823,20 @@ struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
     return multi_backend_buffer_i;
 }
 
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
 // scheduler
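After this refactor ggml_backend_multi_buffer_alloc_buffer() no longer allocates anything itself: it wraps an array of already-allocated buffers, reports their combined size, and takes ownership of them, so freeing the multi buffer frees every part and clear is forwarded to each part. Since get_base and the tensor callbacks stay NULL, the wrapper only manages lifetime; tensors still live in the individual parts. A hypothetical usage sketch, assuming ggml is linked and that ggml-backend-impl.h is on the include path (the function is declared there):

    /* illustrative usage, not part of the patch */
    #include "ggml-backend.h"
    #include "ggml-backend-impl.h"
    #include <assert.h>

    int main(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        ggml_backend_buffer_t parts[2] = {
            ggml_backend_buft_alloc_buffer(buft, 1024),
            ggml_backend_buft_alloc_buffer(buft, 2048),
        };

        /* the multi buffer copies the array and takes ownership of the parts */
        ggml_backend_buffer_t multi = ggml_backend_multi_buffer_alloc_buffer(parts, 2);

        /* the reported size is the sum of the parts */
        assert(ggml_backend_buffer_get_size(multi) == 1024 + 2048);

        /* freeing the multi buffer frees both parts */
        ggml_backend_buffer_free(multi);
        return 0;
    }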
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b25c4a8fb071c..7be588d597eb0 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -10428,7 +10428,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -10704,7 +10704,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
@@ -10784,7 +10784,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
             /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
             /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
diff --git a/ggml-metal.m b/ggml-metal.m
index 35a077299de94..587907bdf56b7 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2445,7 +2445,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
             /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+            /* .get_max_size     = */ NULL, // TODO: return device.maxBufferLength
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index db56337ffe81b..1cfd6dc81694b 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -2055,7 +2055,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+    /* .get_max_size     = */ NULL, // TODO: return from device info
     /* .get_alloc_size   = */ NULL,
     /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
@@ -2112,7 +2112,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
             /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
             /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 56381cb7742f6..408e3c3ba086c 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -4343,7 +4343,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
             /* .get_name         = */ ggml_backend_vk_host_buffer_type_name,
             /* .alloc_buffer     = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_vk_host_buffer_type_get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to UINT64_MAX
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
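A backend opts into the splitting path in ggml-alloc.c simply by implementing the get_max_size callback that the TODO comments above leave unfilled. A hypothetical implementation for a backend whose device caps single allocations at 1 GiB, assuming the usual ggml.h and ggml-backend-impl.h definitions are in scope (the function name and the constant are illustrative, not part of this patch):

    /* illustrative get_max_size callback, not part of the patch */
    GGML_CALL static size_t ggml_backend_example_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
        return 1024ull * 1024 * 1024; /* the device's per-allocation limit */

        GGML_UNUSED(buft);
    }

    // wired into the buffer type interface in place of the NULL placeholder:
    //     /* .get_max_size     = */ ggml_backend_example_buffer_type_get_max_size,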