Workaround Metal maxBufferLength
parent fa84c4b3e8
commit b2c0973b44
4 changed files with 32 additions and 61 deletions
@@ -70,7 +70,6 @@ int main(int argc, char ** argv) {
    // debug output
    {
        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
        ggml_metal_get_tensor(ctx_metal, logits);

        float * ptr = (float *) ggml_get_data(logits);
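The debug block above syncs the logits tensor from device memory before reading it through ggml_get_data(). A hypothetical continuation (not part of the commit; the n_vocab bound is an assumed value for illustration) would dump the first few values:

    // hypothetical continuation of the debug block: print a few logits
    // after the device -> host sync; n_vocab is an assumed bound
    const int n_vocab = 32000;
    for (int i = 0; i < 8 && i < n_vocab; ++i) {
        fprintf(stderr, "logit[%d] = %8.4f\n", i, ptr[i]);
    }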
ggml-metal.h

@@ -13,9 +13,6 @@
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

@@ -48,12 +45,6 @@ bool ggml_metal_add_buffer(
        void * data,
        size_t size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
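Taken together, these declarations describe the host-side flow: map the model allocation once with ggml_metal_add_buffer(), then move individual tensors with the set/get functions around graph evaluation. A minimal sketch of that flow, assuming the ggml_metal_init()/ggml_metal_free() pair from the same header and an already-built graph:

    // minimal host-side sketch of the API above; ggml_metal_init() and
    // ggml_metal_free() are assumed to be declared in ggml-metal.h
    #include "ggml.h"
    #include "ggml-metal.h"

    static bool eval_on_metal(struct ggml_cgraph * gf,
                              struct ggml_tensor * input,
                              struct ggml_tensor * output,
                              void * data, size_t data_size) {
        struct ggml_metal_context * ctx_metal = ggml_metal_init();
        if (ctx_metal == NULL) {
            return false;
        }

        // map the host allocation into device memory (shared, no copy)
        if (!ggml_metal_add_buffer(ctx_metal, "data", data, data_size)) {
            ggml_metal_free(ctx_metal);
            return false;
        }

        ggml_metal_set_tensor   (ctx_metal, input);  // host -> device
        ggml_metal_graph_compute(ctx_metal, gf);     // evaluate on the GPU
        ggml_metal_get_tensor   (ctx_metal, output); // device -> host

        ggml_metal_free(ctx_metal);
        return true;
    }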
ggml-metal.m (73 changes)
@@ -223,56 +223,47 @@ bool ggml_metal_add_buffer(
     }
 
     size_t page_size = getpagesize();
-    size_t aligned_size = size;
-    if ((aligned_size % page_size) != 0) {
-        aligned_size += (page_size - (aligned_size % page_size));
+    size_t sys_max_buffer_size = 2ul * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
+
+    // Make sure total size is page-aligned
+    size_t total_aligned_size = size;
+    if ((total_aligned_size % page_size) != 0) {
+        total_aligned_size += (page_size - (total_aligned_size % page_size));
     }
 
-    ctx->buffers[ctx->n_buffers].name = name;
-    ctx->buffers[ctx->n_buffers].data = data;
-    ctx->buffers[ctx->n_buffers].size = size;
-
-    if (ctx->device.maxBufferLength < aligned_size) {
-        fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-        return false;
-    }
-    ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
-
-    if (ctx->buffers[ctx->n_buffers].metal == nil) {
-        fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-        return false;
-    } else {
-        fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+    // Make sure chunk size is page-aligned
+    size_t max_chunk_size = sys_max_buffer_size / 2;
+    if ((max_chunk_size % page_size) != 0) {
+        max_chunk_size += (page_size - (max_chunk_size % page_size));
     }
 
-    ++ctx->n_buffers;
+    size_t chunk_offset = 0;
+    while (total_aligned_size > 0) {
+        size_t chunk_logical_size = (max_chunk_size > total_aligned_size) ? total_aligned_size : max_chunk_size;
+        size_t sys_buffer_size = (sys_max_buffer_size > total_aligned_size) ? total_aligned_size : sys_max_buffer_size;
+        void * chunk = (uint8_t *) data + chunk_offset;
+
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = chunk;
+        ctx->buffers[ctx->n_buffers].size = chunk_logical_size;
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:chunk length:sys_buffer_size options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name,
+                    sys_buffer_size / 1024.0 / 1024.0);
+            return false;
+        } else {
+            fprintf(stderr, "%s: allocated '%-16s' buffer, sys_size = %8.2f MB, size = %8.2f MB, max: %zu\n", __func__, name,
+                    sys_buffer_size / 1024.0 / 1024.0, chunk_logical_size / 1024.0 / 1024.0, sys_max_buffer_size);
+        }
+
+        ++ctx->n_buffers;
+        total_aligned_size -= chunk_logical_size;
+        chunk_offset += chunk_logical_size;
+    }
 
     return true;
 }
 
 void ggml_metal_set_tensor(
         struct ggml_metal_context * ctx,
         struct ggml_tensor * t) {
     metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
 
     size_t offs;
     id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
 
     memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
 }
 
 void ggml_metal_get_tensor(
         struct ggml_metal_context * ctx,
         struct ggml_tensor * t) {
     metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
 
     size_t offs;
     id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
 
     memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
 }
 
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
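The rewritten loop is the heart of the workaround: logical chunks advance by at most sys_max_buffer_size / 2 bytes, while each newBufferWithBytesNoCopy call maps up to the full sys_max_buffer_size starting at the chunk's offset. Consecutive Metal buffers therefore overlap by half the cap, so a tensor smaller than max_chunk_size is always wholly contained in the buffer of the chunk it starts in, even if it straddles a chunk boundary, and ggml_metal_get_buffer() can resolve any tensor to a single buffer plus offset. A standalone sketch of the arithmetic (the 2 GiB cap and the 5 GiB total are assumed example values; the real code consults ctx->device.maxBufferLength):

    // standalone sketch of the chunking arithmetic above; plain C, no Metal
    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
        size_t page_size = (size_t) getpagesize();
        size_t sys_max_buffer_size = 2ul * 1024ul * 1024ul * 1024ul; // assumed 2 GiB cap
        size_t size = 5ul * 1024ul * 1024ul * 1024ul;                // assumed 5 GiB allocation

        // round the total up to a whole number of pages
        size_t total_aligned_size = size;
        if ((total_aligned_size % page_size) != 0) {
            total_aligned_size += (page_size - (total_aligned_size % page_size));
        }

        // logical chunks are half the cap, so a tensor crossing a chunk
        // boundary still fits inside that chunk's full-cap Metal buffer
        size_t max_chunk_size = sys_max_buffer_size / 2;
        if ((max_chunk_size % page_size) != 0) {
            max_chunk_size += (page_size - (max_chunk_size % page_size));
        }

        size_t chunk_offset = 0;
        while (total_aligned_size > 0) {
            size_t chunk_logical_size = (max_chunk_size > total_aligned_size) ? total_aligned_size : max_chunk_size;
            size_t sys_buffer_size    = (sys_max_buffer_size > total_aligned_size) ? total_aligned_size : sys_max_buffer_size;
            printf("chunk @ %10zu: logical %10zu bytes, Metal buffer %10zu bytes\n",
                   chunk_offset, chunk_logical_size, sys_buffer_size);
            total_aligned_size -= chunk_logical_size;
            chunk_offset       += chunk_logical_size;
        }
        return 0;
    }

For a 5 GiB allocation this registers five 1 GiB logical chunks: the first four are backed by overlapping 2 GiB Metal buffers and the last by a 1 GiB buffer, each within the assumed maxBufferLength.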
llama.cpp (10 changes)
@@ -1588,7 +1588,6 @@ static bool llama_eval_internal(
#ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
    } else {
        // IMPORTANT:
        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla

@@ -1597,15 +1596,6 @@ static bool llama_eval_internal(
        //
        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
        // But for now, we have focused only on Matrix x Vector Metal multiplication.
        //
        // TODO: avoid these syncs via shared memory (ref #1696)
        //
        if (lctx.ctx_metal) {
            // We need to sync the GPU KV cache with the CPU KV cache
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
        }

        ggml_graph_compute(ctx0, &gf);
    }
#else
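Condensed, the dispatch logic in llama_eval_internal reads as follows (a paraphrase of the two hunks above for readability, using the same identifiers; not new code in the commit). Single-token decoding (Matrix x Vector) runs on Metal and only the output tensor is copied back; prompt evaluation falls back to the CPU, first pulling the KV cache off the device so the CPU sees entries written by earlier GPU steps:

    // condensed view of the branch shown in the hunks above
    if (lctx.ctx_metal && N == 1) {
        // Matrix x Vector: evaluate on the GPU, read back only the result
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);
        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
    } else {
        // Matrix x Matrix: sync the KV cache to the host, then run on CPU
        if (lctx.ctx_metal) {
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
        }
        ggml_graph_compute(ctx0, &gf);
    }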