kompute : better device management

2024-01-29 12:07:35 -05:00 · 2024-01-29 12:07:35 -05:00 · be7c0559d3
commit be7c0559d3
parent 530462550d
4 changed files with 150 additions and 122 deletions
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -346,9 +346,8 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 #endif
 #ifdef GGML_USE_KOMPUTE
-    extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
-    extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
+    ggml_backend_kompute_reg_devices();
    ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL);
 #endif
 }
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@ -60,8 +60,18 @@
 #define QK_NL 16
 typedef ggml_fp16_t half;
 // Builds the display name for a Kompute device: the literal "Kompute"
 // followed by the decimal device index (e.g. device 0 -> "Kompute0").
 static std::string ggml_kompute_format_name(int device) {
    std::string label("Kompute");
    label += std::to_string(device);
    return label;
 }
 // Per-backend state for one Kompute device.
 //   device - device index this context was created for
 //   name   - display name derived from the index via ggml_kompute_format_name()
 //   pool   - descriptor pool handle; the constructor leaves it empty
 struct ggml_kompute_context {
    int device;
    std::string name;
    std::shared_ptr<vk::DescriptorPool> pool;
    // explicit: a bare int must never convert silently into a context object.
    explicit ggml_kompute_context(int device)
        : device(device), name(ggml_kompute_format_name(device)) {}
 };
 // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
@ -210,6 +220,7 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
        d.heapSize = heapSize;
        d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID));
        d.subgroupSize = subgroupProperties.subgroupSize;
        d.bufferAlignment = properties.limits.minStorageBufferOffsetAlignment;
        std::string name(properties.deviceName);
        size_t n_idx = ++count_by_name[name];
@ -271,42 +282,28 @@ static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std
    );
 }
-static bool ggml_vk_init_device(size_t memoryRequired, const std::string & device) {
+static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
-    if (device.empty())
+    if (name.empty())
        return false;
    auto devices = ggml_vk_available_devices_internal(memoryRequired);
-    if (device == "amd" || device == "nvidia" || device == "intel") {
+    if (name == "amd" || name == "nvidia" || name == "intel") {
-        ggml_vk_filterByVendor(devices, device);
+        ggml_vk_filterByVendor(devices, name);
-    } else if (device != "gpu") {
+    } else if (name != "gpu") {
-        ggml_vk_filterByName(devices, device);
+        ggml_vk_filterByName(devices, name);
    }
-    return !devices.empty() && ggml_vk_init_device_idx(devices[0].index);
+    if (devices.empty())
 }
 // C-string entry point for device initialization.
 //   memoryRequired - forwarded unchanged to the std::string overload
 //   device         - selector string; may be null
 // Returns false for a null selector (constructing std::string from a null
 // pointer is undefined behaviour); otherwise returns whatever the
 // std::string overload returns.
 bool ggml_vk_init_device(size_t memoryRequired, const char * device) {
    if (device == nullptr) {
        return false;
    }
    return ggml_vk_init_device(memoryRequired, std::string(device));
 }
 // Asks the Kompute manager to initialize the device with the given index,
 // passing the four Vulkan extension names listed below, and then returns
 // the result of ggml_vk_has_device().
 bool ggml_vk_init_device_idx(int device) {
    komputeManager()->initializeDevice(device, {},
                         {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
                          "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
    return ggml_vk_has_device();
 }
 bool ggml_vk_free_device() {
    if (!ggml_vk_has_device())
        return false;
-    komputeManager.destroy();
+
-    // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact
+    *device = devices.front();
    // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which
    // is very brittle
    s_kompute_context = nullptr;
    return true;
 }
 // C-string entry point for device lookup.
 //   device         - output slot, forwarded unchanged to the std::string overload
 //   memoryRequired - forwarded unchanged to the std::string overload
 //   name           - selector string; may be null
 // Returns false for a null selector (constructing std::string from a null
 // pointer is undefined behaviour); otherwise returns whatever the
 // std::string overload returns.
 bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
    if (name == nullptr) {
        return false;
    }
    return ggml_vk_get_device(device, memoryRequired, std::string(name));
 }
 // Returns the result of the Kompute manager's hasVulkan() query.
 bool ggml_vk_has_vulkan() {
    return komputeManager()->hasVulkan();
 }
@ -315,10 +312,6 @@ bool ggml_vk_has_device() {
    return komputeManager()->hasDevice();
 }
 // Returns true while s_kompute_context is non-null, i.e. while a Kompute
 // context object is currently recorded in that variable.
 bool ggml_vk_using_vulkan() {
    return s_kompute_context != nullptr;
 }
 ggml_vk_device ggml_vk_current_device() {
    if (!komputeManager()->hasDevice())
        return ggml_vk_device();
@ -328,20 +321,6 @@ ggml_vk_device ggml_vk_current_device() {
    return devices.front();
 }
 // Allocates a new ggml_kompute_context, records it in s_kompute_context and
 // returns it. Asserts that s_kompute_context was null beforehand.
 // NOTE(review): this default-constructs ggml_kompute_context; confirm that
 // matches the constructors the struct actually provides.
 static ggml_kompute_context * ggml_vk_init() {
    GGML_ASSERT(s_kompute_context == nullptr);
    s_kompute_context = new ggml_kompute_context;
    return s_kompute_context;
 }
 // Releases the context recorded in s_kompute_context.
 //   ctx - must be the pointer currently held in s_kompute_context
 //         (checked with assert); may be null.
 static void ggml_vk_free(struct ggml_kompute_context * ctx) {
    assert(ctx == s_kompute_context);
    s_kompute_context = nullptr;
    delete ctx; // deleting a null pointer is a no-op, so no separate null test
 }
 static
 void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
    std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
@ -503,20 +482,22 @@ static void ggml_vk_free_memory(ggml_vk_memory &memory)
    }
 }
 static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
 static
 ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
    // compatibility with ggml-backend
-    GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type());
+    GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
-    ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context;
+    ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
-    const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
+    const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
-    GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size);
+    GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
-    offset = (uint64_t)ioffs;
+    offset = uint64_t(ioffs);
    return buf_ctx;
 }
@ -1746,9 +1727,47 @@ kp::TensorT<uint8_t>::dataType()
 // backend interface
 // State attached to one Kompute buffer type.
 //   device           - device index this buffer type belongs to
 //   device_ref       - reference counter, starts at zero
 //   buffer_alignment - alignment value supplied at construction
 //   name             - display name derived from the device index
 struct ggml_backend_kompute_buffer_type_context {
    int         device;
    int         device_ref;
    uint64_t    buffer_alignment;
    std::string name;
    ggml_backend_kompute_buffer_type_context(int device_index, uint64_t alignment)
        : device(device_index)
        , device_ref(0)
        , buffer_alignment(alignment)
        , name(ggml_kompute_format_name(device_index)) {}
 };
 // Takes one reference on the device behind `buft`.
 // When the counter is still zero, the Kompute manager is first asked to
 // initialize that device with the Vulkan extensions listed below. The
 // counter is incremented after asserting that a device is available.
 static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
    auto * buft_ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
    const bool first_user = (buft_ctx->device_ref == 0);
    if (first_user) {
        komputeManager()->initializeDevice(
            buft_ctx->device, {}, {
                "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
                "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
            }
        );
    }
    assert(ggml_vk_has_device());
    buft_ctx->device_ref += 1;
 }
 // Drops one reference on the device behind `buft`.
 // Asserts the counter is positive, decrements it, and calls
 // komputeManager.destroy() once it reaches zero.
 static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
    auto * buft_ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
    assert(buft_ctx->device_ref > 0);
    if (--buft_ctx->device_ref == 0) {
        komputeManager.destroy();
    }
 }
 static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
-    GGML_UNUSED(buffer);
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
-    return "Kompute";
+    return ctx->name.c_str();
 }
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@ -1808,28 +1827,19 @@ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
 // default buffer type
 static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
-    return "Kompute";
+    return ctx->name.c_str();
 }
 // Allocates a buffer of `size` bytes for the buffer type `buft`.
 // A device reference is taken before the allocation; the heap-allocated
 // ggml_vk_memory record is handed to ggml_backend_buffer_init as the
 // buffer's context.
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_kompute_device_ref(buft);
    ggml_vk_memory * memory = new ggml_vk_memory(ggml_vk_allocate(size));
    ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, memory, size);
    return buffer;
 }
 static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
-
+    return ctx->buffer_alignment;
    static size_t minStorageBufferOffsetAlignment = 0;
    if (minStorageBufferOffsetAlignment == 0) {
        GGML_ASSERT(ggml_vk_has_device());
        vk::PhysicalDeviceProperties deviceProperties;
        deviceProperties = komputeManager()->physicalDevice()->getProperties();
        vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
        minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
    }
    return minStorageBufferOffsetAlignment;
 }
 static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
@ -1837,42 +1847,62 @@ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffe
    return ggml_backend_is_kompute(backend);
 }
-ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() {
+static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
        /* .iface = */ {
    /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
    /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
    /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
    /* .is_host          = */ NULL,
        },
        /* .context = */ NULL,
 };
-    return &ggml_backend_buffer_type_kompute;
+ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
    static std::vector<ggml_backend_buffer_type> bufts = []() {
        std::vector<ggml_backend_buffer_type> vec;
        auto devices = ggml_vk_available_devices_internal(0);
        vec.reserve(devices.size());
        for (const auto & dev : devices) {
            vec.push_back({
                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment)
            });
        }
        return vec;
    }();
    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) { 
        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
    });
    return it < bufts.end() ? &*it : nullptr;
 }
 // backend
 static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-    return "Kompute";
+    return ctx->name.c_str();
 }
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
-    struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-    ggml_vk_free(ctx);
+
    assert(ctx == s_kompute_context);
    s_kompute_context = nullptr;
    if (ctx != nullptr) {
        delete ctx;
    }
    delete backend;
 }
 static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
-    return ggml_backend_kompute_buffer_type();
+    return ggml_backend_kompute_buffer_type(ctx->device);
 }
 static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    auto * ctx = (ggml_kompute_context *)backend->context;
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
    ggml_vk_graph_compute(ctx, cgraph);
    return true;
 }
@ -1897,17 +1927,13 @@ static struct ggml_backend_i kompute_backend_i = {
    /* .supports_op             = */ ggml_backend_kompute_supports_op,
 };
-ggml_backend_t ggml_backend_kompute_init() {
+ggml_backend_t ggml_backend_kompute_init(int device) {
-    if (!ggml_vk_has_device()) {
+    GGML_ASSERT(s_kompute_context == nullptr);
-        fprintf(stderr, "%s: error: device was not initialized\n", __func__);
+    s_kompute_context = new ggml_kompute_context(device);
        return nullptr;
    }
    struct ggml_kompute_context * ctx = ggml_vk_init();
    ggml_backend_t kompute_backend = new ggml_backend {
        /* .interface = */ kompute_backend_i,
-        /* .context   = */ ctx,
+        /* .context   = */ s_kompute_context,
    };
    return kompute_backend;
@ -1917,10 +1943,22 @@ bool ggml_backend_is_kompute(ggml_backend_t backend) {
    return backend && backend->iface.get_name == ggml_backend_kompute_name;
 }
-extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
+static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
 ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
    GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
+    return ggml_backend_kompute_init(intptr_t(user_data));
-    return ggml_backend_kompute_init();
+}
 // Registers one backend entry per device returned by
 // ggml_vk_available_devices_internal(0). Each entry uses the formatted
 // device name, the shared init callback, the device's buffer type, and the
 // device index packed into the pointer-sized user-data argument.
 // Returns the number of devices that were enumerated.
 extern "C" int ggml_backend_kompute_reg_devices();
 int ggml_backend_kompute_reg_devices() {
    const auto devices = ggml_vk_available_devices_internal(0);
    for (const auto & dev : devices) {
        const auto index = dev.index;
        // Named local keeps the string alive for the whole registration call.
        const std::string backend_name = ggml_kompute_format_name(index);
        ggml_backend_register(
            backend_name.c_str(),
            ggml_backend_reg_kompute_init,
            ggml_backend_kompute_buffer_type(index),
            reinterpret_cast<void *>(intptr_t(index))
        );
    }
    return static_cast<int>(devices.size());
 }
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@ -5,6 +5,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
@ -17,15 +18,13 @@ struct ggml_vk_device {
    const char * name;
    const char * vendor;
    int subgroupSize;
    uint64_t bufferAlignment;
 };
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_init_device(size_t memoryRequired, const char * device);
+bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_init_device_idx(int device);
 bool ggml_vk_free_device(void);
 bool ggml_vk_has_vulkan(void);
 bool ggml_vk_has_device(void);
 bool ggml_vk_using_vulkan(void);
 struct ggml_vk_device ggml_vk_current_device(void);
 //
@ -35,11 +34,11 @@ struct ggml_vk_device ggml_vk_current_device(void);
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
-GGML_API ggml_backend_t ggml_backend_kompute_init(void);
+GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
 GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
 #ifdef __cplusplus
 }
--- a/llama.cpp
+++ b/llama.cpp
@ -1280,7 +1280,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #elif defined(GGML_USE_CLBLAST)
    buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type();
+    buft = ggml_backend_kompute_buffer_type(gpu);
    if (buft == nullptr) {
        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
    }
 #endif
    if (buft == nullptr) {
@ -9860,13 +9863,6 @@ void llama_backend_init(bool numa) {
 #ifdef GGML_USE_MPI
    ggml_mpi_backend_init();
 #endif
 #ifdef GGML_USE_KOMPUTE
    if (!ggml_vk_has_device()) {
        ggml_vk_init_device(0, "gpu");
    }
 #endif
 }
 void llama_backend_free(void) {
@ -9874,10 +9870,6 @@ void llama_backend_free(void) {
    ggml_mpi_backend_free();
 #endif
    ggml_quantize_free();
 #ifdef GGML_USE_KOMPUTE
    ggml_vk_free_device();
 #endif
 }
 int64_t llama_time_us(void) {
@ -10034,8 +10026,8 @@ struct llama_context * llama_new_context_with_model(
            }
        }
 #elif defined(GGML_USE_KOMPUTE)
-        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
+        if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init();
+            auto * backend = ggml_backend_kompute_init(model->main_gpu);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                llama_free(ctx);