Skip to content

Commit

Permalink
refactor multi buf
Browse files Browse the repository at this point in the history
  • Loading branch information
slaren committed Jan 22, 2024
1 parent bcf2a44 commit 944184b
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 140 deletions.
183 changes: 80 additions & 103 deletions ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -776,136 +776,113 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
}

// utils
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);

size_t nbytes = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL && t->view_src == NULL) {
nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}
}

if (nbytes == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

// single buffer allocation
if (nbytes <= max_size) {
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
if (buffer == NULL) {
// failed to allocate buffer
static bool alloc_tensor_range(struct ggml_context * ctx,
struct ggml_tensor * first, struct ggml_tensor * last,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
return NULL;
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free(*buffers[i]);
}
free(buffers);
return false;
}

ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
} else {
ggml_backend_view_init(buffer, t);
}
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
} else {
if (t->view_src != NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(buffer, t);
}
ggml_backend_view_init(buffer, t);
}
}

ggml_tallocr_free(tallocr);

return buffer;
}

// multi-buffer
size_t n_allocs = (nbytes - 1 + max_size) / max_size;
size_t * nbytes_per_alloc = (size_t *) malloc(n_allocs * sizeof(size_t));
memset(nbytes_per_alloc, 0, n_allocs * sizeof(size_t));

// Calculate nbytes per alloc
size_t alloc_idx = 0;
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL && t->view_src == NULL) {
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
if (nbytes_per_alloc[alloc_idx] + tensor_size > max_size) {
// Move to next allocation
alloc_idx += 1;
} else {
if (t->view_src != NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(buffer, t);
}
nbytes_per_alloc[alloc_idx] += tensor_size;
}
}

ggml_backend_buffer_t multi_buffer = ggml_backend_multi_buffer_alloc_buffer(n_allocs, buft, nbytes);
ggml_backend_multi_buffer_context_t multi_ctx = (ggml_backend_multi_buffer_context_t) multi_buffer->context;
ggml_tallocr_free(tallocr);

size_t bytes_counter = 0;
struct ggml_tensor * current_tensor = ggml_get_first_tensor(ctx);
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer;

for (alloc_idx = 0; alloc_idx < n_allocs; alloc_idx++) {
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes_per_alloc[alloc_idx]);
if (buffer == NULL) {
// failed to allocate buffer
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
#endif

// free previously allocated buffers
for (size_t dealloc_idx = 0; dealloc_idx < alloc_idx; dealloc_idx++) {
ggml_backend_buffer_free(multi_ctx->buffers[dealloc_idx]);
}

free(nbytes_per_alloc);
return true;
}

return NULL;
}
ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

multi_ctx->buffers[alloc_idx] = buffer;
size_t alignment = ggml_backend_buft_get_alignment(buft);
size_t max_size = ggml_backend_buft_get_max_size(buft);

ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
ggml_backend_buffer_t * buffers = NULL;
size_t n_buffers = 0;

for (; current_tensor != NULL; current_tensor = ggml_get_next_tensor(ctx, current_tensor)) {
size_t tensor_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, current_tensor), alignment);
size_t cur_buf_size = 0;
struct ggml_tensor * first = ggml_get_first_tensor(ctx);
for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
size_t this_size = 0;
if (t->data == NULL && t->view_src == NULL) {
this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
}

if (bytes_counter + tensor_size > max_size) {
// tensor uses next buffer
bytes_counter = 0;
break;
if (this_size > max_size) {
// tensor is too large to fit in a single buffer
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
for (size_t i = 0; i < n_buffers; i++) {
ggml_backend_buffer_free(buffers[i]);
}
free(buffers);
return NULL;
}

bytes_counter += tensor_size;
if (current_tensor->data == NULL) {
if (current_tensor->view_src == NULL) {
ggml_tallocr_alloc(tallocr, current_tensor);
} else {
ggml_backend_view_init(buffer, current_tensor);
}
} else {
if (current_tensor->view_src != NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(buffer, current_tensor);
}
if ((cur_buf_size + this_size) > max_size) {
// allocate tensors in the current buffer
if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
first = t;
cur_buf_size = this_size;
} else {
cur_buf_size += this_size;
}
}

ggml_tallocr_free(tallocr);
// allocate remaining tensors
if (cur_buf_size > 0) {
if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
return NULL;
}
}

free(nbytes_per_alloc);
if (n_buffers == 0) {
// all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

return multi_buffer;
ggml_backend_buffer_t buffer;
if (n_buffers == 1) {
buffer = buffers[0];
} else {
buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
}
free(buffers);
return buffer;
}

ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
Expand Down
15 changes: 2 additions & 13 deletions ggml-backend-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,8 @@ extern "C" {
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

// multi-buffer
struct ggml_backend_multi_buffer_context {
ggml_backend_buffer_t * buffers;
size_t n_buffers;
};

typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;

GGML_CALL const char* ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer);
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(size_t n_buffers, ggml_backend_buffer_type_t buft, size_t nbytes);
GGML_CALL void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value);
struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void);
// create a buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);

//
// Backend
Expand Down
46 changes: 29 additions & 17 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
}

size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
// get_max_size is optional, defaults to UINT64_MAX
// get_max_size is optional, defaults to SIZE_MAX
if (buft->iface.get_max_size) {
return buft->iface.get_max_size(buft);
}
return UINT64_MAX;
return SIZE_MAX;
}

GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
Expand Down Expand Up @@ -564,7 +564,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
Expand Down Expand Up @@ -620,7 +620,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
Expand Down Expand Up @@ -775,24 +775,22 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
GGML_UNUSED(user_data);
}


// multi-buffer buffer

GGML_CALL const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
struct ggml_backend_multi_buffer_context {
ggml_backend_buffer_t * buffers;
size_t n_buffers;
};

return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
}
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;

GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(size_t n_buffers, ggml_backend_buffer_type_t buft, size_t nbytes) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;

return ggml_backend_buffer_init(buft, ggml_backend_multi_buffer_context_interface(), ctx, nbytes);
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
}

GGML_CALL void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_free(ctx->buffers[i]);
Expand All @@ -802,14 +800,14 @@ GGML_CALL void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffe
free(ctx);
}

GGML_CALL void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
ggml_backend_buffer_clear(ctx->buffers[i], value);
}
}

struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
/* .get_name = */ ggml_backend_multi_buffer_get_name,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
Expand All @@ -825,6 +823,20 @@ struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
return multi_backend_buffer_i;
}

GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));

size_t total_size = 0;
for (size_t i = 0; i < n_buffers; i++) {
ctx->buffers[i] = buffers[i];
total_size += ggml_backend_buffer_get_size(buffers[i]);
}

return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
}


// scheduler

Expand Down
6 changes: 3 additions & 3 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10428,7 +10428,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ NULL,
Expand Down Expand Up @@ -10704,7 +10704,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
Expand Down Expand Up @@ -10784,7 +10784,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
Expand Down
2 changes: 1 addition & 1 deletion ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -2445,7 +2445,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
Expand Down
4 changes: 2 additions & 2 deletions ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2055,7 +2055,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // TODO: return from device info
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
Expand Down Expand Up @@ -2112,7 +2112,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to UINT64_MAX
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
Expand Down
Loading

0 comments on commit 944184b

Please sign in to comment.