kompute : better device management

This commit is contained in:
Jared Van Bortel 2024-01-29 12:07:35 -05:00
parent 530462550d
commit be7c0559d3
4 changed files with 150 additions and 122 deletions

View file

@ -346,9 +346,8 @@ GGML_CALL static void ggml_backend_registry_init(void) {
#endif #endif
#ifdef GGML_USE_KOMPUTE #ifdef GGML_USE_KOMPUTE
extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data); extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); ggml_backend_kompute_reg_devices();
ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL);
#endif #endif
} }

View file

@ -60,8 +60,18 @@
#define QK_NL 16 #define QK_NL 16
typedef ggml_fp16_t half; typedef ggml_fp16_t half;
// Builds the per-device label: the literal "Kompute" immediately followed by
// the decimal form of `device` (device 0 -> "Kompute0").
static std::string ggml_kompute_format_name(int device) {
    std::string label = "Kompute";
    label += std::to_string(device);
    return label;
}
struct ggml_kompute_context { struct ggml_kompute_context {
int device;
std::string name;
std::shared_ptr<vk::DescriptorPool> pool; std::shared_ptr<vk::DescriptorPool> pool;
ggml_kompute_context(int device)
: device(device), name(ggml_kompute_format_name(device)) {}
}; };
// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
@ -210,6 +220,7 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
d.heapSize = heapSize; d.heapSize = heapSize;
d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID)); d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID));
d.subgroupSize = subgroupProperties.subgroupSize; d.subgroupSize = subgroupProperties.subgroupSize;
d.bufferAlignment = properties.limits.minStorageBufferOffsetAlignment;
std::string name(properties.deviceName); std::string name(properties.deviceName);
size_t n_idx = ++count_by_name[name]; size_t n_idx = ++count_by_name[name];
@ -271,42 +282,28 @@ static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std
); );
} }
static bool ggml_vk_init_device(size_t memoryRequired, const std::string & device) { static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
if (device.empty()) if (name.empty())
return false; return false;
auto devices = ggml_vk_available_devices_internal(memoryRequired); auto devices = ggml_vk_available_devices_internal(memoryRequired);
if (device == "amd" || device == "nvidia" || device == "intel") { if (name == "amd" || name == "nvidia" || name == "intel") {
ggml_vk_filterByVendor(devices, device); ggml_vk_filterByVendor(devices, name);
} else if (device != "gpu") { } else if (name != "gpu") {
ggml_vk_filterByName(devices, device); ggml_vk_filterByName(devices, name);
} }
return !devices.empty() && ggml_vk_init_device_idx(devices[0].index); if (devices.empty())
}
bool ggml_vk_init_device(size_t memoryRequired, const char * device) {
return ggml_vk_init_device(memoryRequired, std::string(device));
}
bool ggml_vk_init_device_idx(int device) {
komputeManager()->initializeDevice(device, {},
{"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
"VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
return ggml_vk_has_device();
}
bool ggml_vk_free_device() {
if (!ggml_vk_has_device())
return false; return false;
komputeManager.destroy();
// FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact *device = devices.front();
// the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which
// is very brittle
s_kompute_context = nullptr;
return true; return true;
} }
// C-string entry point for device lookup; forwards to the std::string overload.
//
// device          - receives the selected device when the lookup succeeds
// memoryRequired  - passed through unchanged to the std::string overload
// name            - device selector as a C string; may be null
//
// Returns whatever the std::string overload returns, or false when `name` is null.
bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
    // Constructing a std::string from a null pointer is undefined behavior.
    // The std::string overload already rejects an empty name with `false`,
    // so a null name is reported the same way instead of being dereferenced.
    if (name == nullptr) {
        return false;
    }
    return ggml_vk_get_device(device, memoryRequired, std::string(name));
}
bool ggml_vk_has_vulkan() { bool ggml_vk_has_vulkan() {
return komputeManager()->hasVulkan(); return komputeManager()->hasVulkan();
} }
@ -315,10 +312,6 @@ bool ggml_vk_has_device() {
return komputeManager()->hasDevice(); return komputeManager()->hasDevice();
} }
bool ggml_vk_using_vulkan() {
return s_kompute_context != nullptr;
}
ggml_vk_device ggml_vk_current_device() { ggml_vk_device ggml_vk_current_device() {
if (!komputeManager()->hasDevice()) if (!komputeManager()->hasDevice())
return ggml_vk_device(); return ggml_vk_device();
@ -328,20 +321,6 @@ ggml_vk_device ggml_vk_current_device() {
return devices.front(); return devices.front();
} }
static ggml_kompute_context * ggml_vk_init() {
GGML_ASSERT(s_kompute_context == nullptr);
s_kompute_context = new ggml_kompute_context;
return s_kompute_context;
}
static void ggml_vk_free(struct ggml_kompute_context * ctx) {
assert(ctx == s_kompute_context);
s_kompute_context = nullptr;
if (ctx != nullptr) {
delete ctx;
}
}
static static
void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) { void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = { std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
@ -503,20 +482,22 @@ static void ggml_vk_free_memory(ggml_vk_memory &memory)
} }
} }
static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
static static
ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) { ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
// compatibility with ggml-backend // compatibility with ggml-backend
GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type()); GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context; ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data); const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size); GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
offset = (uint64_t)ioffs; offset = uint64_t(ioffs);
return buf_ctx; return buf_ctx;
} }
@ -1746,9 +1727,47 @@ kp::TensorT<uint8_t>::dataType()
// backend interface // backend interface
struct ggml_backend_kompute_buffer_type_context {
int device;
int device_ref = 0;
uint64_t buffer_alignment;
std::string name;
ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment)
: device(device), buffer_alignment(buffer_alignment), name(ggml_kompute_format_name(device)) {}
};
// Adds one reference to the device recorded in `buft`'s context.
// When the count is still zero, the device is first initialized through
// komputeManager() with the four VK_KHR_* names listed below.
static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
    auto * buft_ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);

    const bool first_reference = (buft_ctx->device_ref == 0);
    if (first_reference) {
        komputeManager()->initializeDevice(
            buft_ctx->device,
            {},
            {
                "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
                "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
            }
        );
    }

    // Plain assert: this check disappears when NDEBUG is defined.
    assert(ggml_vk_has_device());
    buft_ctx->device_ref += 1;
}
// Drops one reference from the device recorded in `buft`'s context and
// calls komputeManager.destroy() once the count reaches zero.
static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
    auto * buft_ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);

    // Plain assert: an unbalanced unref is only caught when NDEBUG is not defined.
    assert(buft_ctx->device_ref > 0);

    buft_ctx->device_ref -= 1;
    if (buft_ctx->device_ref != 0) {
        return;  // other references remain
    }
    komputeManager.destroy();
}
static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) { static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
GGML_UNUSED(buffer); auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
return "Kompute"; return ctx->name.c_str();
} }
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@ -1808,28 +1827,19 @@ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
// default buffer type // default buffer type
static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
GGML_UNUSED(buft); auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
return "Kompute"; return ctx->name.c_str();
} }
static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_kompute_device_ref(buft);
auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size)); auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size); return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
} }
static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
GGML_UNUSED(buft); auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
return ctx->buffer_alignment;
static size_t minStorageBufferOffsetAlignment = 0;
if (minStorageBufferOffsetAlignment == 0) {
GGML_ASSERT(ggml_vk_has_device());
vk::PhysicalDeviceProperties deviceProperties;
deviceProperties = komputeManager()->physicalDevice()->getProperties();
vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
}
return minStorageBufferOffsetAlignment;
} }
static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
@ -1837,42 +1847,62 @@ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffe
return ggml_backend_is_kompute(backend); return ggml_backend_is_kompute(backend);
} }
ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() { static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
/* .iface = */ {
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name, /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
/* .is_host = */ NULL, /* .is_host = */ NULL,
},
/* .context = */ NULL,
}; };
return &ggml_backend_buffer_type_kompute; ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
static std::vector<ggml_backend_buffer_type> bufts = []() {
std::vector<ggml_backend_buffer_type> vec;
auto devices = ggml_vk_available_devices_internal(0);
vec.reserve(devices.size());
for (const auto & dev : devices) {
vec.push_back({
/* .iface = */ ggml_backend_kompute_buffer_type_interface,
/* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment)
});
}
return vec;
}();
auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
});
return it < bufts.end() ? &*it : nullptr;
} }
// backend // backend
static const char * ggml_backend_kompute_name(ggml_backend_t backend) { static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
GGML_UNUSED(backend); auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
return "Kompute"; return ctx->name.c_str();
} }
static void ggml_backend_kompute_free(ggml_backend_t backend) { static void ggml_backend_kompute_free(ggml_backend_t backend) {
struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context; auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_free(ctx);
assert(ctx == s_kompute_context);
s_kompute_context = nullptr;
if (ctx != nullptr) {
delete ctx;
}
delete backend; delete backend;
} }
static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) { static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
GGML_UNUSED(backend); auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
return ggml_backend_kompute_buffer_type(); return ggml_backend_kompute_buffer_type(ctx->device);
} }
static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = (ggml_kompute_context *)backend->context; auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph); ggml_vk_graph_compute(ctx, cgraph);
return true; return true;
} }
@ -1897,17 +1927,13 @@ static struct ggml_backend_i kompute_backend_i = {
/* .supports_op = */ ggml_backend_kompute_supports_op, /* .supports_op = */ ggml_backend_kompute_supports_op,
}; };
ggml_backend_t ggml_backend_kompute_init() { ggml_backend_t ggml_backend_kompute_init(int device) {
if (!ggml_vk_has_device()) { GGML_ASSERT(s_kompute_context == nullptr);
fprintf(stderr, "%s: error: device was not initialized\n", __func__); s_kompute_context = new ggml_kompute_context(device);
return nullptr;
}
struct ggml_kompute_context * ctx = ggml_vk_init();
ggml_backend_t kompute_backend = new ggml_backend { ggml_backend_t kompute_backend = new ggml_backend {
/* .interface = */ kompute_backend_i, /* .interface = */ kompute_backend_i,
/* .context = */ ctx, /* .context = */ s_kompute_context,
}; };
return kompute_backend; return kompute_backend;
@ -1917,10 +1943,22 @@ bool ggml_backend_is_kompute(ggml_backend_t backend) {
return backend && backend->iface.get_name == ggml_backend_kompute_name; return backend && backend->iface.get_name == ggml_backend_kompute_name;
} }
extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data); static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
GGML_UNUSED(params); GGML_UNUSED(params);
GGML_UNUSED(user_data); return ggml_backend_kompute_init(intptr_t(user_data));
return ggml_backend_kompute_init(); }
extern "C" int ggml_backend_kompute_reg_devices();
int ggml_backend_kompute_reg_devices() {
auto devices = ggml_vk_available_devices_internal(0);
for (const auto & device : devices) {
ggml_backend_register(
ggml_kompute_format_name(device.index).c_str(),
ggml_backend_reg_kompute_init,
ggml_backend_kompute_buffer_type(device.index),
reinterpret_cast<void *>(intptr_t(device.index))
);
}
return devices.size();
} }

View file

@ -5,6 +5,7 @@
#include <stdbool.h> #include <stdbool.h>
#include <stddef.h> #include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -17,15 +18,13 @@ struct ggml_vk_device {
const char * name; const char * name;
const char * vendor; const char * vendor;
int subgroupSize; int subgroupSize;
uint64_t bufferAlignment;
}; };
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_init_device(size_t memoryRequired, const char * device); bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_init_device_idx(int device);
bool ggml_vk_free_device(void);
bool ggml_vk_has_vulkan(void); bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void); bool ggml_vk_has_device(void);
bool ggml_vk_using_vulkan(void);
struct ggml_vk_device ggml_vk_current_device(void); struct ggml_vk_device ggml_vk_current_device(void);
// //
@ -35,11 +34,11 @@ struct ggml_vk_device ggml_vk_current_device(void);
// forward declaration // forward declaration
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
GGML_API ggml_backend_t ggml_backend_kompute_init(void); GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -1280,7 +1280,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type(); buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE) #elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(); buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif #endif
if (buft == nullptr) { if (buft == nullptr) {
@ -9860,13 +9863,6 @@ void llama_backend_init(bool numa) {
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
ggml_mpi_backend_init(); ggml_mpi_backend_init();
#endif #endif
#ifdef GGML_USE_KOMPUTE
if (!ggml_vk_has_device()) {
ggml_vk_init_device(0, "gpu");
}
#endif
} }
void llama_backend_free(void) { void llama_backend_free(void) {
@ -9874,10 +9870,6 @@ void llama_backend_free(void) {
ggml_mpi_backend_free(); ggml_mpi_backend_free();
#endif #endif
ggml_quantize_free(); ggml_quantize_free();
#ifdef GGML_USE_KOMPUTE
ggml_vk_free_device();
#endif
} }
int64_t llama_time_us(void) { int64_t llama_time_us(void) {
@ -10034,8 +10026,8 @@ struct llama_context * llama_new_context_with_model(
} }
} }
#elif defined(GGML_USE_KOMPUTE) #elif defined(GGML_USE_KOMPUTE)
if (ggml_vk_has_device() && model->n_gpu_layers > 0) { if (model->n_gpu_layers > 0) {
auto * backend = ggml_backend_kompute_init(); auto * backend = ggml_backend_kompute_init(model->main_gpu);
if (backend == nullptr) { if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__); LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
llama_free(ctx); llama_free(ctx);