Merge branch 'master' into sycl_async_data_load
This commit is contained in:
commit
3a25182685
9 changed files with 302 additions and 160 deletions
|
@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
|
|||
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
|
||||
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
|
||||
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
|
||||
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
|
||||
|
||||
**UI:**
|
||||
|
||||
|
|
|
@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
||||
if (!image_embed_result) {
|
||||
clip_image_u8_free(img);
|
||||
LOG_ERR("%s: coulnd't embed the image\n", __func__);
|
||||
LOG_ERR("%s: couldn't embed the image\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
|||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -542,6 +542,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
|||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_BLAS
|
||||
#include "ggml-blas.h"
|
||||
#endif
|
||||
|
@ -564,6 +568,9 @@ struct ggml_backend_registry {
|
|||
#ifdef GGML_USE_SYCL
|
||||
register_backend(ggml_backend_sycl_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_VULKAN
|
||||
register_backend(ggml_backend_vk_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
register_backend(ggml_backend_blas_reg());
|
||||
#endif
|
||||
|
@ -571,7 +578,7 @@ struct ggml_backend_registry {
|
|||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
|
||||
// TODO: vulkan, kompute, cann
|
||||
// TODO: kompute, cann
|
||||
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
}
|
||||
|
@ -689,8 +696,6 @@ ggml_backend_t ggml_backend_init_best(void) {
|
|||
|
||||
// backend CPU
|
||||
|
||||
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "CPU";
|
||||
|
||||
|
@ -709,7 +714,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
ggml_aligned_free(buffer->context, buffer->size);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
|
@ -777,14 +782,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
|
|||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
||||
auto alloc_size = size;
|
||||
if (alloc_size == 0) {
|
||||
alloc_size = 1;
|
||||
}
|
||||
|
||||
void * data = ggml_aligned_malloc(alloc_size);
|
||||
|
||||
if (data == NULL) {
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
|
||||
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
|
|
|
@ -19,6 +19,9 @@ extern "C" {
|
|||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// required for mmap as gguf only guarantees 32-byte alignment
|
||||
#define TENSOR_ALIGNMENT 32
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
|
@ -196,6 +199,11 @@ struct ggml_cgraph {
|
|||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
// Memory allocation
|
||||
|
||||
void * ggml_aligned_malloc(size_t size);
|
||||
void ggml_aligned_free(void * ptr, size_t size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
if (device->fp16) {
|
||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
device->name = device->properties.deviceName.data();
|
||||
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||
|
||||
device_create_info = {
|
||||
vk::DeviceCreateFlags(),
|
||||
|
@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|||
|
||||
device->buffer_type = {
|
||||
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
|
||||
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
||||
};
|
||||
|
||||
|
@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .device = */ nullptr,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
|
@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_vk_guid() {
|
||||
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
||||
|
||||
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
||||
ggml_vk_init(ctx, dev_num);
|
||||
|
||||
ggml_backend_t vk_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .interface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return vk_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
||||
}
|
||||
|
||||
int ggml_backend_vk_get_device_count() {
|
||||
return ggml_vk_get_device_count();
|
||||
}
|
||||
|
||||
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
int dev_idx = vk_instance.device_indices[device];
|
||||
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||
}
|
||||
|
||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
|
||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
*free = heap.size;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////
|
||||
|
||||
struct ggml_backend_vk_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||
ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ggml_backend_vk_buffer_type(ctx->device);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
return ggml_backend_vk_host_buffer_type();
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||
UNUSED(dev);
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
}
|
||||
|
||||
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_vk_device_get_name(dev);
|
||||
props->description = ggml_backend_vk_device_get_description(dev);
|
||||
props->type = ggml_backend_vk_device_get_type(dev);
|
||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* async */ false,
|
||||
/* host_buffer */ true,
|
||||
/* events */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
UNUSED(params);
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
return ggml_backend_vk_init(ctx->device);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
|
@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
|
|||
return false;
|
||||
}
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
||||
|
||||
return buft_ctx->device->idx == ctx->device;
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
|
||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||
|
||||
UNUSED(backend);
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
||||
|
||||
return buft_ctx->device == ctx->device;
|
||||
}
|
||||
|
||||
// TODO: enable async and synchronize
|
||||
static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .get_name = */ ggml_backend_vk_name,
|
||||
/* .free = */ ggml_backend_vk_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_vk_offload_op,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
|
||||
/* .get_name = */ ggml_backend_vk_device_get_name,
|
||||
/* .get_description = */ ggml_backend_vk_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_vk_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_vk_device_get_type,
|
||||
/* .get_props = */ ggml_backend_vk_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_vk_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
|
||||
/* .buffer_from_host_ptr = */ NULL,
|
||||
/* .supports_op = */ ggml_backend_vk_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_vk_device_offload_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_vk_guid() {
|
||||
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
||||
return &guid;
|
||||
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
|
||||
UNUSED(reg);
|
||||
return GGML_VK_NAME;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
||||
|
||||
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
||||
ggml_vk_init(ctx, dev_num);
|
||||
|
||||
ggml_backend_t vk_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_vk_guid(),
|
||||
/* .interface = */ ggml_backend_vk_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return vk_backend;
|
||||
static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
UNUSED(reg);
|
||||
return ggml_backend_vk_get_device_count();
|
||||
}
|
||||
|
||||
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
||||
}
|
||||
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
|
||||
static std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
int ggml_backend_vk_get_device_count() {
|
||||
return ggml_vk_get_device_count();
|
||||
}
|
||||
static bool initialized = false;
|
||||
|
||||
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
||||
ggml_vk_get_device_description(device, description, description_size);
|
||||
}
|
||||
|
||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||
|
||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||
|
||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||
|
||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||
*total = heap.size;
|
||||
*free = heap.size;
|
||||
break;
|
||||
{
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
if (!initialized) {
|
||||
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||
char desc[256];
|
||||
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
||||
ctx->device = i;
|
||||
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||
ctx->description = desc;
|
||||
devices.push_back(new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_vk_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ ctx,
|
||||
});
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(device < devices.size());
|
||||
return devices[device];
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
|
||||
/* .get_name = */ ggml_backend_vk_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_vk_reg_get_device,
|
||||
/* .get_proc_address = */ NULL,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_vk_reg() {
|
||||
static ggml_backend_reg reg = {
|
||||
/* .iface = */ ggml_backend_vk_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &reg;
|
||||
}
|
||||
|
||||
// Extension availability
|
||||
|
|
|
@ -35,10 +35,6 @@
|
|||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
#undef GGML_USE_LLAMAFILE
|
||||
#endif
|
||||
|
@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
|
|||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <unistd.h>
|
||||
#include <mach/mach.h>
|
||||
#include <TargetConditionals.h>
|
||||
#endif
|
||||
|
||||
|
@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
|||
//#define GGML_SOFT_MAX_ACCELERATE
|
||||
#endif
|
||||
|
||||
|
||||
void * ggml_aligned_malloc(size_t size) {
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||
return _aligned_malloc(size, TENSOR_ALIGNMENT);
|
||||
#else
|
||||
inline static void * ggml_aligned_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * aligned_memory = NULL;
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
int result = hbw_posix_memalign(&aligned_memory, 16, size);
|
||||
int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
|
||||
#elif TARGET_OS_OSX
|
||||
kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
|
||||
int result = EFAULT;
|
||||
switch (alloc_status) {
|
||||
case KERN_SUCCESS:
|
||||
result = 0;
|
||||
break;
|
||||
case KERN_INVALID_ADDRESS:
|
||||
result = EINVAL;
|
||||
break;
|
||||
case KERN_NO_SPACE:
|
||||
result = ENOMEM;
|
||||
break;
|
||||
default:
|
||||
result = EFAULT;
|
||||
break;
|
||||
}
|
||||
#elif GGML_USE_METAL
|
||||
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
|
||||
const long page_size = sysconf(_SC_PAGESIZE);
|
||||
int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
|
||||
#else
|
||||
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
||||
int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
|
||||
#endif
|
||||
if (result != 0) {
|
||||
// Handle allocation failure
|
||||
|
@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
return NULL;
|
||||
}
|
||||
return aligned_memory;
|
||||
#endif
|
||||
}
|
||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
|
||||
|
||||
void ggml_aligned_free(void * ptr, size_t size) {
|
||||
GGML_UNUSED(size);
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
_aligned_free(ptr);
|
||||
#elif GGML_USE_CPU_HBM
|
||||
if (ptr != NULL) {
|
||||
hbw_free(ptr);
|
||||
}
|
||||
#elif TARGET_OS_OSX
|
||||
if (ptr != NULL) {
|
||||
vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
|
||||
}
|
||||
#else
|
||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
||||
#endif
|
||||
free(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
inline static void * ggml_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
|
@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
|
||||
*ctx = (struct ggml_context) {
|
||||
/*.mem_size =*/ mem_size,
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
|
||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||
/*.no_alloc =*/ params.no_alloc,
|
||||
/*.no_alloc_save =*/ params.no_alloc,
|
||||
|
@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
|
|||
__func__, i, ggml_used_mem(ctx));
|
||||
|
||||
if (ctx->mem_buffer_owned) {
|
||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
||||
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
|
||||
}
|
||||
|
||||
found = true;
|
||||
|
@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
|
|||
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
||||
if (!threadpool) return;
|
||||
|
||||
const int n_threads = threadpool->n_threads_max;
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
struct ggml_compute_state* workers = threadpool->workers;
|
||||
const int n_threads = threadpool->n_threads_max;
|
||||
|
||||
ggml_mutex_lock(&threadpool->mutex);
|
||||
|
||||
|
@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
|||
ggml_cond_destroy(&threadpool->cond);
|
||||
#endif // GGML_USE_OPENMP
|
||||
|
||||
GGML_ALIGNED_FREE(threadpool->workers);
|
||||
GGML_ALIGNED_FREE(threadpool);
|
||||
const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
|
||||
ggml_aligned_free(threadpool->workers, workers_size);
|
||||
ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
|
||||
}
|
||||
|
||||
#ifndef GGML_USE_OPENMP
|
||||
|
@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
|||
struct ggml_cplan * cplan) {
|
||||
|
||||
struct ggml_threadpool * threadpool =
|
||||
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
|
||||
ggml_aligned_malloc(sizeof(struct ggml_threadpool));
|
||||
{
|
||||
threadpool->cgraph = cgraph;
|
||||
threadpool->cplan = cplan;
|
||||
|
@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
|||
|
||||
// Allocate and init workers state
|
||||
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
|
||||
struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
|
||||
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
|
||||
|
||||
memset(workers, 0, workers_size);
|
||||
for (int j = 0; j < tpp->n_threads; j++) {
|
||||
|
|
|
@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
|
|||
}
|
||||
|
||||
// seed the work queue with all possible 2-character tokens.
|
||||
for (size_t i = 1; i < symbols.size(); ++i) {
|
||||
for (int i = 1; i < (int) symbols.size(); ++i) {
|
||||
try_add_bigram(i - 1, i);
|
||||
}
|
||||
|
||||
|
@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
|
|||
index++;
|
||||
symbols.emplace_back(sym);
|
||||
}
|
||||
for (size_t i = 1; i < symbols.size(); ++i) {
|
||||
for (int i = 1; i < (int) symbols.size(); ++i) {
|
||||
add_new_bigram(i - 1, i);
|
||||
}
|
||||
|
||||
|
|
|
@ -8,11 +8,7 @@
|
|||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.h"
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
#if defined(GGML_USE_KOMPUTE)
|
||||
# include "ggml-kompute.h"
|
||||
#elif defined(GGML_USE_CANN)
|
||||
# include "ggml-cann.h"
|
||||
|
@ -3420,11 +3416,7 @@ static int llama_get_device_count(const llama_model & model) {
|
|||
count += (int) model.rpc_servers.size();
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_SYCL)
|
||||
count += ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
count += ggml_backend_vk_get_device_count();
|
||||
#elif defined(GGML_USE_CANN)
|
||||
#if defined(GGML_USE_CANN)
|
||||
count += ggml_backend_cann_get_device_count();
|
||||
#endif
|
||||
|
||||
|
@ -3451,10 +3443,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
|
|||
}
|
||||
#elif defined(GGML_USE_CPU_HBM)
|
||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
if (host_buffer) {
|
||||
buft = ggml_backend_vk_host_buffer_type();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
|
@ -3473,9 +3461,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|||
}
|
||||
device -= (int)model.devices.size();
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
buft = ggml_backend_vk_buffer_type(device);
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
#if defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(device);
|
||||
#elif defined(GGML_USE_CANN)
|
||||
buft = ggml_backend_cann_buffer_type(device);
|
||||
|
@ -3522,12 +3508,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|||
return free;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_CANN)
|
||||
#if defined(GGML_USE_CANN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cann_get_device_memory(device, &free, &total);
|
||||
|
@ -19082,7 +19063,7 @@ bool llama_supports_mlock(void) {
|
|||
}
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
|
||||
#if defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
#else
|
||||
|
@ -19213,8 +19194,13 @@ struct llama_model * llama_load_model_from_file(
|
|||
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
|
||||
{
|
||||
size_t free, total; // NOLINT
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
||||
model->devices.push_back(dev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19409,32 +19395,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
main_gpu -= (int)model->devices.size();
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
} else {
|
||||
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
||||
ggml_backend_t backend = ggml_backend_vk_init(device);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
#if defined(GGML_USE_KOMPUTE)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
auto * backend = ggml_backend_kompute_init(main_gpu);
|
||||
if (backend == nullptr) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue