add ggml-metal
This commit is contained in:
parent
8e6735ec60
commit
0808aa5a42
4 changed files with 176 additions and 48 deletions
|
@ -9428,8 +9428,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
@ -9441,8 +9439,6 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
@ -9600,8 +9596,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
||||||
|
@ -9611,8 +9605,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
||||||
|
|
|
@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
// helper to check if the device supports a specific family
|
||||||
|
|
180
ggml-metal.m
180
ggml-metal.m
|
@ -607,12 +607,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// temporarily defined here for compatibility between ggml-backend and the old API
|
// temporarily defined here for compatibility between ggml-backend and the old API
|
||||||
struct ggml_backend_metal_buffer_context {
|
|
||||||
void * data;
|
struct ggml_backend_metal_buffer {
|
||||||
|
void * data;
|
||||||
|
size_t size;
|
||||||
|
|
||||||
id<MTLBuffer> metal;
|
id<MTLBuffer> metal;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ggml_backend_metal_buffer_context {
|
||||||
|
void * all_data;
|
||||||
|
size_t all_size;
|
||||||
|
bool owned;
|
||||||
|
|
||||||
|
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
||||||
|
int n_buffers;
|
||||||
|
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||||
|
};
|
||||||
|
|
||||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
|
@ -622,17 +634,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||||
|
|
||||||
const int64_t tsize = ggml_nbytes(t);
|
const int64_t tsize = ggml_nbytes(t);
|
||||||
|
|
||||||
|
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
||||||
|
|
||||||
// compatibility with ggml-backend
|
// compatibility with ggml-backend
|
||||||
if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
|
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
|
||||||
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
|
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
||||||
|
|
||||||
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
|
// find the view that contains the tensor fully
|
||||||
|
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
||||||
|
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
||||||
|
|
||||||
GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
|
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
||||||
|
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
||||||
|
*offs = (size_t) ioffs;
|
||||||
|
|
||||||
*offs = (size_t) ioffs;
|
//GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
||||||
|
|
||||||
return buf_ctx->metal;
|
return buf_ctx->buffers[i].metal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
||||||
|
|
||||||
|
return nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
// find the view that contains the tensor fully
|
// find the view that contains the tensor fully
|
||||||
|
@ -2361,6 +2385,7 @@ void ggml_metal_graph_compute(
|
||||||
|
|
||||||
// backend interface
|
// backend interface
|
||||||
|
|
||||||
|
// default buffer
|
||||||
static id<MTLDevice> g_backend_device = nil;
|
static id<MTLDevice> g_backend_device = nil;
|
||||||
static int g_backend_device_ref_count = 0;
|
static int g_backend_device_ref_count = 0;
|
||||||
|
|
||||||
|
@ -2388,34 +2413,33 @@ static void ggml_backend_metal_free_device(void) {
|
||||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
return ctx->data;
|
return ctx->all_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
[ctx->metal release];
|
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
[ctx->buffers[i].metal release];
|
||||||
|
}
|
||||||
ggml_backend_metal_free_device();
|
ggml_backend_metal_free_device();
|
||||||
|
|
||||||
free(ctx->data);
|
if (ctx->owned) {
|
||||||
|
free(ctx->all_data);
|
||||||
|
}
|
||||||
|
|
||||||
free(ctx);
|
free(ctx);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
|
@ -2433,7 +2457,7 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
|
@ -2443,6 +2467,8 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
||||||
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// default buffer type
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
|
@ -2453,13 +2479,30 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
||||||
size_aligned += (size_page - (size_aligned % size_page));
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->data = ggml_metal_host_malloc(size);
|
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||||
ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
|
|
||||||
|
ctx->all_data = ggml_metal_host_malloc(size_aligned);
|
||||||
|
ctx->all_size = size_aligned;
|
||||||
|
ctx->owned = true;
|
||||||
|
ctx->n_buffers = 1;
|
||||||
|
|
||||||
|
ctx->buffers[0].data = ctx->all_data;
|
||||||
|
ctx->buffers[0].size = size;
|
||||||
|
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
|
||||||
length:size_aligned
|
length:size_aligned
|
||||||
options:MTLResourceStorageModeShared
|
options:MTLResourceStorageModeShared
|
||||||
deallocator:nil];
|
deallocator:nil];
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
|
if (ctx->buffers[0].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0);
|
||||||
|
free(ctx);
|
||||||
|
ggml_backend_metal_free_device();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
|
@ -2470,7 +2513,7 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
|
||||||
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
|
@ -2487,6 +2530,91 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
return &ggml_backend_buffer_type_metal;
|
return &ggml_backend_buffer_type_metal;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// buffer from ptr
|
||||||
|
|
||||||
|
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
||||||
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
|
ctx->all_data = data;
|
||||||
|
ctx->all_size = size;
|
||||||
|
ctx->owned = false;
|
||||||
|
ctx->n_buffers = 0;
|
||||||
|
|
||||||
|
const size_t size_page = sysconf(_SC_PAGESIZE);
|
||||||
|
size_t size_aligned = size;
|
||||||
|
if ((size_aligned % size_page) != 0) {
|
||||||
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
|
}
|
||||||
|
|
||||||
|
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||||
|
|
||||||
|
const char * name = "from_ptr";
|
||||||
|
|
||||||
|
// the buffer fits into the max buffer size allowed by the device
|
||||||
|
if (size_aligned <= device.maxBufferLength) {
|
||||||
|
ctx->buffers[ctx->n_buffers].name = name;
|
||||||
|
ctx->buffers[ctx->n_buffers].data = data;
|
||||||
|
ctx->buffers[ctx->n_buffers].size = size;
|
||||||
|
|
||||||
|
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
++ctx->n_buffers;
|
||||||
|
} else {
|
||||||
|
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
||||||
|
// one of the views
|
||||||
|
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
||||||
|
const size_t size_step = device.maxBufferLength - size_ovlp;
|
||||||
|
const size_t size_view = device.maxBufferLength;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; i += size_step) {
|
||||||
|
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
||||||
|
|
||||||
|
ctx->buffers[ctx->n_buffers].name = name;
|
||||||
|
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
||||||
|
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
||||||
|
|
||||||
|
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||||
|
if (i + size_step < size) {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
++ctx->n_buffers;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if TARGET_OS_OSX
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
|
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||||
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
|
} else {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// backend
|
||||||
|
|
||||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
|
@ -2499,10 +2627,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||||
free(backend);
|
free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
|
|
||||||
UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
return ggml_backend_metal_buffer_type();
|
return ggml_backend_metal_buffer_type();
|
||||||
|
|
||||||
|
@ -2529,8 +2653,8 @@ static struct ggml_backend_i metal_backend_i = {
|
||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_from_async = */ NULL,
|
/* .cpy_tensor_from_async = */ NULL,
|
||||||
/* .cpy_tensor_to_async = */ NULL,
|
/* .cpy_tensor_to_async = */ NULL,
|
||||||
/* .synchronize = */ ggml_backend_metal_synchronize,
|
/* .synchronize = */ NULL,
|
||||||
/* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||||
|
|
33
llama.cpp
33
llama.cpp
|
@ -2881,17 +2881,20 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: metal should be disabled with ngl=0 -> cpu_buffer_type
|
// TODO: metal should be disabled with ngl=0 -> cpu_buffer_type
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type() {
|
static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
return ggml_backend_metal_buffer_type();
|
if (n_gpu_layers > 0) {
|
||||||
|
return ggml_backend_metal_buffer_type();
|
||||||
|
}
|
||||||
#elif GGML_USE_CUBLAS
|
#elif GGML_USE_CUBLAS
|
||||||
printf("Using " GGML_CUDA_NAME " host buffer type\n");
|
|
||||||
return ggml_backend_cuda_host_buffer_type();
|
return ggml_backend_cuda_host_buffer_type();
|
||||||
#elif GGML_USE_CPU_HBM
|
#elif GGML_USE_CPU_HBM
|
||||||
return ggml_backend_cpu_hbm_buffer_type();
|
return ggml_backend_cpu_hbm_buffer_type();
|
||||||
#else
|
|
||||||
return ggml_backend_cpu_buffer_type();
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
return ggml_backend_cpu_buffer_type();
|
||||||
|
|
||||||
|
GGML_UNUSED(n_gpu_layers);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llm_load_tensors(
|
static void llm_load_tensors(
|
||||||
|
@ -3430,7 +3433,7 @@ static void llm_load_tensors(
|
||||||
size_t vram_weights = 0;
|
size_t vram_weights = 0;
|
||||||
size_t buf_size = 0;
|
size_t buf_size = 0;
|
||||||
|
|
||||||
ggml_backend_buffer_type_t buft = llama_default_buffer_type();
|
ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
|
||||||
|
|
||||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||||
if (t->backend == GGML_BACKEND_CPU) {
|
if (t->backend == GGML_BACKEND_CPU) {
|
||||||
|
@ -3440,13 +3443,14 @@ static void llm_load_tensors(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// create backend buffer
|
// create backend buffer
|
||||||
bool sys_mem_buf = false;
|
bool sys_mem_buf = false;
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
|
// todo: disable with 0 gpu layers
|
||||||
if (ml.use_mmap) {
|
if (ml.use_mmap) {
|
||||||
model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
|
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
||||||
|
model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
|
||||||
} else {
|
} else {
|
||||||
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
|
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
|
||||||
sys_mem_buf = true;
|
sys_mem_buf = true;
|
||||||
|
@ -5940,12 +5944,12 @@ static int llama_decode_internal(
|
||||||
if (ggml_backend_is_metal(lctx.backend)) {
|
if (ggml_backend_is_metal(lctx.backend)) {
|
||||||
ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
|
ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
|
||||||
}
|
}
|
||||||
#else
|
#endif
|
||||||
|
|
||||||
if (ggml_backend_is_cpu(lctx.backend)) {
|
if (ggml_backend_is_cpu(lctx.backend)) {
|
||||||
ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
|
ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
|
||||||
}
|
}
|
||||||
ggml_backend_graph_compute(lctx.backend, gf);
|
ggml_backend_graph_compute(lctx.backend, gf);
|
||||||
#endif
|
|
||||||
|
|
||||||
#if GGML_USE_MPI
|
#if GGML_USE_MPI
|
||||||
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
||||||
|
@ -8952,6 +8956,9 @@ struct llama_context * llama_new_context_with_model(
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
ctx->backend = ggml_backend_metal_init();
|
ctx->backend = ggml_backend_metal_init();
|
||||||
|
if (ctx->backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (ctx->backend == nullptr) {
|
if (ctx->backend == nullptr) {
|
||||||
|
@ -8959,12 +8966,14 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->backend == nullptr) {
|
if (ctx->backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||||
delete ctx;
|
delete ctx;
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, llama_default_buffer_type())) {
|
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
|
||||||
|
cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv,
|
||||||
|
llama_default_buffer_type(model->n_gpu_layers))) {
|
||||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue