From be7c0559d32387c26446c0e8fe844a073bf8f202 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Mon, 29 Jan 2024 12:07:35 -0500
Subject: [PATCH] kompute : better device management

---
 ggml-backend.c   |   5 +-
 ggml-kompute.cpp | 236 +++++++++++++++++++++++++++--------------------
 ggml-kompute.h   |  11 +--
 llama.cpp        |  20 ++--
 4 files changed, 150 insertions(+), 122 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index ed4260634..532d5bd28 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -346,9 +346,8 @@ GGML_CALL static void ggml_backend_registry_init(void) {
 #endif
 
 #ifdef GGML_USE_KOMPUTE
-    extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
-    ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL);
+    extern int ggml_backend_kompute_reg_devices(void); // must match the extern "C" definition in ggml-kompute.cpp: returns int, no GGML_CALL
+    ggml_backend_kompute_reg_devices();
 #endif
 }
 
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
index f6bba6838..270b91010 100644
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
@@ -60,8 +60,18 @@
 #define QK_NL 16
 
 typedef ggml_fp16_t half;
+
+static std::string ggml_kompute_format_name(int device) {
+    return "Kompute" + std::to_string(device);
+}
+
 struct ggml_kompute_context {
+    int device;
+    std::string name;
     std::shared_ptr<vk::DescriptorPool> pool;
+
+    ggml_kompute_context(int device)
+        : device(device), name(ggml_kompute_format_name(device)) {}
 };
 
 // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
@@ -210,6 +220,7 @@ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t mem
         d.heapSize = heapSize;
         d.vendor = strdup(ggml_vk_getVendorName(properties.vendorID));
         d.subgroupSize = subgroupProperties.subgroupSize;
+        d.bufferAlignment = properties.limits.minStorageBufferOffsetAlignment;
 
         std::string name(properties.deviceName);
         size_t n_idx = ++count_by_name[name];
@@ -271,42 +282,28 @@ static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std
     );
 }
 
-static bool ggml_vk_init_device(size_t memoryRequired, const std::string & device) {
-    if (device.empty())
+static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
+    if (name.empty())
         return false;
 
     auto devices = ggml_vk_available_devices_internal(memoryRequired);
-    if (device == "amd" || device == "nvidia" || device == "intel") {
-        ggml_vk_filterByVendor(devices, device);
-    } else if (device != "gpu") {
-        ggml_vk_filterByName(devices, device);
+    if (name == "amd" || name == "nvidia" || name == "intel") {
+        ggml_vk_filterByVendor(devices, name);
+    } else if (name != "gpu") {
+        ggml_vk_filterByName(devices, name);
     }
 
-    return !devices.empty() && ggml_vk_init_device_idx(devices[0].index);
-}
-
-bool ggml_vk_init_device(size_t memoryRequired, const char * device) {
-    return ggml_vk_init_device(memoryRequired, std::string(device));
-}
-
-bool ggml_vk_init_device_idx(int device) {
-    komputeManager()->initializeDevice(device, {},
-                         {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
-                          "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"});
-    return ggml_vk_has_device();
-}
-
-bool ggml_vk_free_device() {
-    if (!ggml_vk_has_device())
+    if (devices.empty())
         return false;
-    komputeManager.destroy();
-    // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact
-    // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which
-    // is very brittle
-    s_kompute_context = nullptr;
+
+    *device = devices.front();
     return true;
 }
 
+bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
+    return ggml_vk_get_device(device, memoryRequired, std::string(name ? name : "")); // NULL name -> "" -> rejected (returns false) by the overload above
+}
+
 bool ggml_vk_has_vulkan() {
     return komputeManager()->hasVulkan();
 }
@@ -315,10 +312,6 @@ bool ggml_vk_has_device() {
     return komputeManager()->hasDevice();
 }
 
-bool ggml_vk_using_vulkan() {
-    return s_kompute_context != nullptr;
-}
-
 ggml_vk_device ggml_vk_current_device() {
     if (!komputeManager()->hasDevice())
         return ggml_vk_device();
@@ -328,20 +321,6 @@ ggml_vk_device ggml_vk_current_device() {
     return devices.front();
 }
 
-static ggml_kompute_context * ggml_vk_init() {
-    GGML_ASSERT(s_kompute_context == nullptr);
-    s_kompute_context = new ggml_kompute_context;
-    return s_kompute_context;
-}
-
-static void ggml_vk_free(struct ggml_kompute_context * ctx) {
-    assert(ctx == s_kompute_context);
-    s_kompute_context = nullptr;
-    if (ctx != nullptr) {
-        delete ctx;
-    }
-}
-
 static
 void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
@@ -503,20 +482,22 @@ static void ggml_vk_free_memory(ggml_vk_memory &memory)
     }
 }
 
+static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
+
 static
 ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
     ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
 
     // compatibility with ggml-backend
-    GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type());
+    GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
 
-    ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context;
+    ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
 
-    const intptr_t ioffs = reinterpret_cast<intptr_t>(t->data) - reinterpret_cast<intptr_t>(buf_ctx->data);
+    const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
 
-    GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size);
+    GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
 
-    offset = (uint64_t)ioffs;
+    offset = uint64_t(ioffs);
     return buf_ctx;
 }
 
@@ -1746,9 +1727,47 @@ kp::TensorT<uint8_t>::dataType()
 
 // backend interface
 
+struct ggml_backend_kompute_buffer_type_context {
+    int         device;
+    int         device_ref = 0;
+    uint64_t    buffer_alignment;
+    std::string name;
+
+    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment)
+        : device(device), buffer_alignment(buffer_alignment), name(ggml_kompute_format_name(device)) {}
+};
+
+static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+
+    if (!ctx->device_ref) { // no outstanding references for this buffer type yet: initialize its device
+        komputeManager()->initializeDevice(
+            ctx->device, {}, {
+                "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
+                "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
+            }
+        );
+    }
+
+    GGML_ASSERT(ggml_vk_has_device()); // enforced in release builds too: a plain assert() is compiled out under NDEBUG
+    ctx->device_ref++;
+}
+
+static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+
+    assert(ctx->device_ref > 0);
+
+    ctx->device_ref--;
+
+    if (!ctx->device_ref) {
+        komputeManager.destroy();
+    }
+}
+
 static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
-    GGML_UNUSED(buffer);
-    return "Kompute";
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
+    return ctx->name.c_str();
 }
 
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -1808,28 +1827,19 @@ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
 // default buffer type
 
 static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-    return "Kompute";
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+    return ctx->name.c_str();
 }
 
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_kompute_device_ref(buft);
     auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
     return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
 }
 
 static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(buft);
-
-    static size_t minStorageBufferOffsetAlignment = 0;
-    if (minStorageBufferOffsetAlignment == 0) {
-        GGML_ASSERT(ggml_vk_has_device());
-        vk::PhysicalDeviceProperties deviceProperties;
-        deviceProperties = komputeManager()->physicalDevice()->getProperties();
-        vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits;
-        minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment;
-    }
-
-    return minStorageBufferOffsetAlignment;
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
+    return ctx->buffer_alignment;
 }
 
 static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
@@ -1837,42 +1847,62 @@ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffe
     return ggml_backend_is_kompute(backend);
 }
 
-ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
-            /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
-            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
-            /* .is_host          = */ NULL,
-        },
-        /* .context = */ NULL,
-    };
+static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
+    /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
+    /* .is_host          = */ NULL,
+};
 
-    return &ggml_backend_buffer_type_kompute;
+ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
+    static std::vector<ggml_backend_buffer_type> bufts = []() { // built once, on the first call
+        std::vector<ggml_backend_buffer_type> vec;
+        auto devices = ggml_vk_available_devices_internal(0);
+        vec.reserve(devices.size());
+
+        for (const auto & dev : devices) {
+            vec.push_back({
+                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
+                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment)
+            });
+        }
+        return vec;
+    }();
+
+    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
+        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
+    });
+    return it != bufts.end() ? &*it : nullptr; // nullptr when no enumerated device has this index
 }
 
 // backend
 
 static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return "Kompute";
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    return ctx->name.c_str();
 }
 
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
-    struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context;
-    ggml_vk_free(ctx);
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+
+    assert(ctx == s_kompute_context);
+    s_kompute_context = nullptr;
+    if (ctx != nullptr) {
+        delete ctx;
+    }
+
     delete backend;
 }
 
 static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-    return ggml_backend_kompute_buffer_type();
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
 static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    auto * ctx = (ggml_kompute_context *)backend->context;
+    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
     return true;
 }
@@ -1897,17 +1927,13 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
 };
 
-ggml_backend_t ggml_backend_kompute_init() {
-    if (!ggml_vk_has_device()) {
-        fprintf(stderr, "%s: error: device was not initialized\n", __func__);
-        return nullptr;
-    }
-
-    struct ggml_kompute_context * ctx = ggml_vk_init();
+ggml_backend_t ggml_backend_kompute_init(int device) {
+    GGML_ASSERT(s_kompute_context == nullptr);
+    s_kompute_context = new ggml_kompute_context(device);
 
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .interface = */ kompute_backend_i,
-        /* .context   = */ ctx,
+        /* .context   = */ s_kompute_context,
     };
 
     return kompute_backend;
@@ -1917,10 +1943,22 @@ bool ggml_backend_is_kompute(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_kompute_name;
 }
 
-extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data);
-
-ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
+static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
     GGML_UNUSED(params);
-    GGML_UNUSED(user_data);
-    return ggml_backend_kompute_init();
+    return ggml_backend_kompute_init(intptr_t(user_data));
+}
+
+extern "C" int ggml_backend_kompute_reg_devices();
+
+int ggml_backend_kompute_reg_devices() {
+    auto devices = ggml_vk_available_devices_internal(0);
+    for (const auto & device : devices) {
+        ggml_backend_register(
+            ggml_kompute_format_name(device.index).c_str(),
+            ggml_backend_reg_kompute_init,
+            ggml_backend_kompute_buffer_type(device.index),
+            reinterpret_cast<void *>(intptr_t(device.index))
+        );
+    }
+    return devices.size();
 }
diff --git a/ggml-kompute.h b/ggml-kompute.h
index d4aeb7731..c56e42f8e 100644
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -5,6 +5,7 @@
 
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -17,15 +18,13 @@ struct ggml_vk_device {
     const char * name;
     const char * vendor;
     int subgroupSize;
+    uint64_t bufferAlignment;
 };
 
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_init_device(size_t memoryRequired, const char * device);
-bool ggml_vk_init_device_idx(int device);
-bool ggml_vk_free_device(void);
+bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_has_vulkan(void);
 bool ggml_vk_has_device(void);
-bool ggml_vk_using_vulkan(void);
 struct ggml_vk_device ggml_vk_current_device(void);
 
 //
@@ -35,11 +34,11 @@ struct ggml_vk_device ggml_vk_current_device(void);
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;
 
-GGML_API ggml_backend_t ggml_backend_kompute_init(void);
+GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
 
 GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index b97d4d960..9605b5007 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1280,7 +1280,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type();
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
 #endif
 
     if (buft == nullptr) {
@@ -9860,13 +9863,6 @@ void llama_backend_init(bool numa) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_init();
 #endif
-
-#ifdef GGML_USE_KOMPUTE
-    if (!ggml_vk_has_device()) {
-        ggml_vk_init_device(0, "gpu");
-    }
-#endif
-
 }
 
 void llama_backend_free(void) {
@@ -9874,10 +9870,6 @@ void llama_backend_free(void) {
     ggml_mpi_backend_free();
 #endif
     ggml_quantize_free();
-
-#ifdef GGML_USE_KOMPUTE
-    ggml_vk_free_device();
-#endif
 }
 
 int64_t llama_time_us(void) {
@@ -10034,8 +10026,8 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 #elif defined(GGML_USE_KOMPUTE)
-        if (ggml_vk_has_device() && model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init();
+        if (model->n_gpu_layers > 0) {
+            auto * backend = ggml_backend_kompute_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                 llama_free(ctx);