add ggml_backend_buffer_is_host, used to avoid copies if possible when accessing tensor data
This commit is contained in:
parent
1ac01fbbd1
commit
c8bd5d8b65
6 changed files with 67 additions and 28 deletions
|
@ -20,6 +20,9 @@ extern "C" {
|
||||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||||
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||||
|
// check if tensor data is in host memory
|
||||||
|
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
||||||
|
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer_type {
|
struct ggml_backend_buffer_type {
|
||||||
|
@ -79,7 +82,7 @@ extern "C" {
|
||||||
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
void (*synchronize) (ggml_backend_t backend);
|
void (*synchronize)(ggml_backend_t backend);
|
||||||
|
|
||||||
// compute graph with a plan
|
// compute graph with a plan
|
||||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
|
|
@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
|
||||||
return buft->iface.supports_backend(buft, backend);
|
return buft->iface.supports_backend(buft, backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
if (buft->iface.is_host) {
|
||||||
|
return buft->iface.is_host(buft);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// backend buffer
|
// backend buffer
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
|
@ -98,6 +105,10 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
buffer->iface.clear(buffer, value);
|
buffer->iface.clear(buffer, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
|
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->buft;
|
return buffer->buft;
|
||||||
}
|
}
|
||||||
|
@ -464,6 +475,12 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
|
||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
return true;
|
||||||
|
|
||||||
|
GGML_UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
|
@ -471,6 +488,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
@ -479,9 +497,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
|
||||||
|
// buffer type HBM
|
||||||
|
|
||||||
#include <hbwmalloc.h>
|
#include <hbwmalloc.h>
|
||||||
|
|
||||||
// HBM buffer type
|
|
||||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
hbw_free(buffer->context);
|
hbw_free(buffer->context);
|
||||||
}
|
}
|
||||||
|
@ -503,16 +523,15 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = {
|
|
||||||
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
||||||
};
|
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||||
/* .iface = */ cpu_backend_hbm_buffer_type_interface,
|
/* .iface = */ {
|
||||||
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ extern "C" {
|
||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||||
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||||
|
@ -30,6 +31,7 @@ extern "C" {
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
|
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
16
ggml-cuda.cu
16
ggml-cuda.cu
|
@ -9568,6 +9568,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||||
|
@ -9606,16 +9607,15 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
||||||
return buffer;
|
return buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
|
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
||||||
};
|
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
||||||
/* .iface = */ ggml_backend_cuda_host_buffer_type_interface,
|
/* .iface = */ {
|
||||||
|
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
|
},
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -2521,6 +2521,12 @@ static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_
|
||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
return true;
|
||||||
|
|
||||||
|
UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
|
@ -2528,6 +2534,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
28
llama.cpp
28
llama.cpp
|
@ -1539,7 +1539,7 @@ static bool llama_kv_cache_init(
|
||||||
ggml_cuda_assign_buffers_no_scratch(v);
|
ggml_cuda_assign_buffers_no_scratch(v);
|
||||||
vram_kv_cache += ggml_nbytes(k);
|
vram_kv_cache += ggml_nbytes(k);
|
||||||
vram_kv_cache += ggml_nbytes(v);
|
vram_kv_cache += ggml_nbytes(v);
|
||||||
// HACK: mark tensor as allocated, but crash if we try to use it from the CPU
|
// HACK: mark tensor as allocated
|
||||||
k->data = v->data = (void *)(uintptr_t)1;
|
k->data = v->data = (void *)(uintptr_t)1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2285,9 +2285,15 @@ struct llama_model_loader {
|
||||||
ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur));
|
ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// FIXME: use read_buf for device buffers without unified memory
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||||
file.seek(offs, SEEK_SET);
|
file.seek(offs, SEEK_SET);
|
||||||
file.read_raw(cur->data, ggml_nbytes(cur));
|
file.read_raw(cur->data, ggml_nbytes(cur));
|
||||||
|
} else {
|
||||||
|
read_buf.resize(ggml_nbytes(cur));
|
||||||
|
file.seek(offs, SEEK_SET);
|
||||||
|
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
||||||
|
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (use_mmap && lmlock) {
|
if (use_mmap && lmlock) {
|
||||||
|
@ -2298,7 +2304,7 @@ struct llama_model_loader {
|
||||||
|
|
||||||
case GGML_BACKEND_GPU:
|
case GGML_BACKEND_GPU:
|
||||||
case GGML_BACKEND_GPU_SPLIT: {
|
case GGML_BACKEND_GPU_SPLIT: {
|
||||||
// HACK: mark tensor as allocated, but crash if we try to use it from the CPU
|
// HACK: mark tensor as allocated
|
||||||
cur->data = (void *)(uintptr_t)1;
|
cur->data = (void *)(uintptr_t)1;
|
||||||
void * data;
|
void * data;
|
||||||
if (use_mmap) {
|
if (use_mmap) {
|
||||||
|
@ -5773,7 +5779,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
const int64_t n_tokens = cur->ne[1];
|
const int64_t n_tokens = cur->ne[1];
|
||||||
|
|
||||||
float * data;
|
float * data;
|
||||||
if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||||
data = (float *) cur->data;
|
data = (float *) cur->data;
|
||||||
} else {
|
} else {
|
||||||
lctx.buf_copy.resize(ggml_nbytes(cur));
|
lctx.buf_copy.resize(ggml_nbytes(cur));
|
||||||
|
@ -5812,7 +5818,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
const int64_t n_ctx = cur->ne[0];
|
const int64_t n_ctx = cur->ne[0];
|
||||||
|
|
||||||
int32_t * data;
|
int32_t * data;
|
||||||
if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
||||||
data = (int32_t *) cur->data;
|
data = (int32_t *) cur->data;
|
||||||
} else {
|
} else {
|
||||||
lctx.buf_copy.resize(ggml_nbytes(cur));
|
lctx.buf_copy.resize(ggml_nbytes(cur));
|
||||||
|
@ -9230,13 +9236,15 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (ctx->backend == nullptr) {
|
if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
|
||||||
// FIXME: this may fail if the model buffer is not compatible with the CPU backend
|
|
||||||
ctx->backend = ggml_backend_cpu_init();
|
ctx->backend = ggml_backend_cpu_init();
|
||||||
|
if (ctx->backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx->backend == nullptr) {
|
if (ctx->backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
|
||||||
delete ctx;
|
delete ctx;
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue