From f24ed14ee0ce28dfe98115c378b37da144912016 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 19 Feb 2024 15:54:12 -0500 Subject: [PATCH 01/17] make : pass CPPFLAGS directly to nvcc, not via -Xcompiler (#5598) --- Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 63b4af9ba..db5df1b32 100644 --- a/Makefile +++ b/Makefile @@ -446,9 +446,9 @@ ifdef LLAMA_CUDA_CCBIN endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ifdef JETSON_EOL_MODULE_DETECT - $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ + $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ else - $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endif # JETSON_EOL_MODULE_DETECT endif # LLAMA_CUBLAS @@ -549,9 +549,10 @@ GF_CC := $(CC) include scripts/get-flags.mk # combine build flags with cmdline overrides -override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) -BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) -override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) +override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) +override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) +BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS) +override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS) override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) From 40c3a6c1e11040088b4a1ce0abc4651cb3011dd4 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 19 Feb 2024 23:40:26 +0100 Subject: [PATCH 02/17] cuda : ignore peer access already enabled errors (#5597) * cuda : ignore peer access already enabled errors * fix hip --- ggml-cuda.cu | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e091dbdc1..6caae56b0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -54,6 +54,8 @@ #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord @@ -9325,9 +9327,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); if (can_access_peer) { if (enable_peer_access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); + if (err != cudaErrorPeerAccessAlreadyEnabled) { + CUDA_CHECK(err); + } } else { - CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + cudaError_t err = cudaDeviceDisablePeerAccess(id_other); + if (err != cudaErrorPeerAccessNotEnabled) { + CUDA_CHECK(err); + } } } } @@ -10999,10 +11007,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe UNUSED(buffer); } -// unused at the moment -//static bool 
ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { -// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; -//} +static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; + UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds +} GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -11390,7 +11398,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->extra != nullptr); } } From 5dde5408978eda22242b87e22e306d1c2d1a5834 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Sat, 3 Feb 2024 17:56:46 +0000 Subject: [PATCH 03/17] Allow for Vulkan build with Accelerate. Closes #5304 --- ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.c b/ggml.c index 4ee2c5e11..d129df505 100644 --- a/ggml.c +++ b/ggml.c @@ -273,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) { #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" +#elif defined(GGML_USE_VULKAN) +#include "ggml-vulkan.h" #endif #elif defined(GGML_USE_OPENBLAS) #if defined(GGML_BLAS_USE_MKL) From 42f664a3825dfde13a32c3577ab66d10c56f3aa6 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Sat, 3 Feb 2024 18:00:11 +0000 Subject: [PATCH 04/17] Resolve ErrorIncompatibleDriver with Vulkan on MacOS. Refs: - https://chat.openai.com/share/7020ce72-65fc-45ec-b7be-9d9d798a5f3f - https://github.com/SaschaWillems/Vulkan/issues/954 - https://github.com/haasn/libplacebo/issues/128 - https://github.com/KhronosGroup/Vulkan-Samples/issues/476 --- ggml-vulkan.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4a30414df..e9e966dbf 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1109,8 +1109,10 @@ static void ggml_vk_instance_init() { #ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", #endif + "VK_KHR_portability_enumeration", }; - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); + + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR), &app_info, layers, extensions); #ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; vk::ValidationFeaturesEXT validation_features = { From d8c054517dc24f1316f3be12a98fff383e1e93e3 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Tue, 6 Feb 2024 14:39:22 +0000 Subject: [PATCH 05/17] Add preprocessor checks for Apple devices. 
Based on work by @rbourgeat in https://github.com/ggerganov/llama.cpp/pull/5322/files --- ggml-vulkan.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index e9e966dbf..33b8a9061 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1109,10 +1109,15 @@ static void ggml_vk_instance_init() { #ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", #endif +#ifdef __APPLE__ "VK_KHR_portability_enumeration", +#endif }; + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); +#ifdef __APPLE__ + instance_create_info.flags = vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; +#endif - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR), &app_info, layers, extensions); #ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; vk::ValidationFeaturesEXT validation_features = { From f50db6ae0bdcb5f8593ca6ca46dfa03b177faa2f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 10 Feb 2024 22:14:52 +0100 Subject: [PATCH 06/17] Add check for VK_KHR_portability_enumeration for MoltenVK support --- ggml-vulkan.cpp | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 33b8a9061..37123ac8f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1100,23 +1100,44 @@ static void ggml_vk_instance_init() { #endif vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; - const std::vector layers = { + + const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); +#ifdef __APPLE__ + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + portability_enumeration_ext = true; + break; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." 
<< std::endl; + } +#endif + + std::vector layers = { #ifdef GGML_VULKAN_VALIDATE "VK_LAYER_KHRONOS_validation", #endif }; - const std::vector extensions = { + std::vector extensions = { #ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", -#endif -#ifdef __APPLE__ - "VK_KHR_portability_enumeration", #endif }; - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); #ifdef __APPLE__ - instance_create_info.flags = vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; + if (portability_enumeration_ext) { + extensions.push_back("VK_KHR_portability_enumeration"); + } #endif + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); +#ifdef __APPLE__ + if (portability_enumeration_ext) { + instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; + } +#endif + #ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; @@ -1175,12 +1196,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { vk_instance.devices[idx] = std::make_shared(); ctx->device = vk_instance.devices[idx]; ctx->device.lock()->physical_device = devices[dev_num]; - std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); + const std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; // Check if maintenance4 is supported - for (auto properties : ext_props) { + for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; } @@ -1211,7 +1232,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { bool fp16_storage = false; bool fp16_compute = false; - for (auto properties : ext_props) { + for (const auto& properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { From bb9dcd560a7e81265398b0d463c40f3e467daf19 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 14 Feb 2024 20:57:17 +0100 Subject: [PATCH 07/17] Refactor validation and enumeration platform checks into functions to clean up ggml_vk_instance_init() --- ggml-vulkan.cpp | 101 ++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 37123ac8f..4e5eaff15 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) { } } -static void ggml_vk_instance_init() { +static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions); +static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); + +void ggml_vk_instance_init() { if (vk_instance_initialized) { return; } @@ -1102,54 +1105,40 @@ static void ggml_vk_instance_init() { vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); -#ifdef __APPLE__ - bool portability_enumeration_ext = false; - // Check for portability enumeration extension for MoltenVK support - for (const auto& properties : instance_extensions) { - if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { - portability_enumeration_ext = true; - break; - } - } - if 
(!portability_enumeration_ext) { - std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; - } -#endif + const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions); + const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); - std::vector layers = { -#ifdef GGML_VULKAN_VALIDATE - "VK_LAYER_KHRONOS_validation", -#endif - }; - std::vector extensions = { -#ifdef GGML_VULKAN_VALIDATE - "VK_EXT_validation_features", -#endif - }; -#ifdef __APPLE__ + std::vector layers; + + if (validation_ext) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + std::vector extensions; + if (validation_ext) { + extensions.push_back("VK_EXT_validation_features"); + } if (portability_enumeration_ext) { extensions.push_back("VK_KHR_portability_enumeration"); } -#endif vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); -#ifdef __APPLE__ if (portability_enumeration_ext) { instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; } -#endif + std::vector features_enable; + vk::ValidationFeaturesEXT validation_features; -#ifdef GGML_VULKAN_VALIDATE - const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; - vk::ValidationFeaturesEXT validation_features = { - features_enable, - {}, - }; - validation_features.setPNext(nullptr); - instance_create_info.setPNext(&validation_features); + if (validation_ext) { + features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; + validation_features = { + features_enable, + {}, + }; + validation_features.setPNext(nullptr); + instance_create_info.setPNext(&validation_features); - std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; -#endif + std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; + } vk_instance.instance = vk::createInstance(instance_create_info); memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES); @@ -5329,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() { return vk_instance.device_indices.size(); } +// Extension availability +static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions) { +#ifdef GGML_VULKAN_VALIDATE + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; + } +#endif + return false; + + UNUSED(instance_extensions); +} +static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions) { +#ifdef __APPLE__ + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." 
<< std::endl; + } +#endif + return false; + + UNUSED(instance_extensions); +} + // checks #ifdef GGML_VULKAN_CHECK_RESULTS From 22f83f0c383e12106692b8afc224d61b8993a52c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 10 Feb 2024 22:18:33 +0100 Subject: [PATCH 08/17] Enable Vulkan MacOS CI --- .devops/nix/package.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index ad23f7dd7..815db6a2d 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -255,11 +255,11 @@ effectiveStdenv.mkDerivation ( # Configurations we don't want even the CI to evaluate. Results in the # "unsupported platform" messages. This is mostly a no-op, because # cudaPackages would've refused to evaluate anyway. - badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin; + badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; # Configurations that are known to result in build failures. Can be # overridden by importing Nixpkgs with `allowBroken = true`. - broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin); + broken = (useMetalKit && !effectiveStdenv.isDarwin); description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; homepage = "https://github.com/ggerganov/llama.cpp/"; From 633782b8d949f24b619e6c68ee37b5cc79167173 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Tue, 13 Feb 2024 20:28:02 +0000 Subject: [PATCH 09/17] nix: now that we can do so, allow MacOS to build Vulkan binaries Author: Philip Taron Date: Tue Feb 13 20:28:02 2024 +0000 --- flake.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index ad2f9b295..dc4e503c3 100644 --- a/flake.nix +++ b/flake.nix @@ -150,6 +150,7 @@ packages = { default = config.legacyPackages.llamaPackages.llama-cpp; + vulkan = config.packages.default.override { useVulkan = true; }; } // lib.optionalAttrs pkgs.stdenv.isLinux { opencl = config.packages.default.override { useOpenCL = true; }; @@ -157,7 +158,6 @@ mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cuda = config.packages.default.override { useMpi = true; }; - vulkan = config.packages.default.override { useVulkan = true; }; } // lib.optionalAttrs (system == "x86_64-linux") { rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp; From b9111bd209c7b11b0592450a6ed2e0ca545b2c84 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Tue, 20 Feb 2024 07:01:25 +0000 Subject: [PATCH 10/17] Update ggml_sycl_op_mul_mat_vec_q (#5502) * Update ggml_sycl_op_mul_mat_vec_q * Apply suggestions from code review Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> * revert suggestion on macro * fix bug * Add quant type GGML_TYPE_IQ1_S to unsupported * fix format --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> --- ggml-sycl.cpp | 258 ++++++++++++++------------------------------------ 1 file changed, 69 insertions(+), 189 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cd4b3a1e1..df1826112 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, } } -static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y 
- 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK4_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK5_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK5_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int 
block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); +template +static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), [= + ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); } int get_device_index_by_id(int id){ @@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q( const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + // TODO: support these quantization types + GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS || + src0->type == GGML_TYPE_IQ2_XS || + src0->type == GGML_TYPE_IQ3_XXS || + src0->type == GGML_TYPE_IQ1_S)); + switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + 
mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream) { - GGML_TENSOR_BINARY_OP_LOCALS + GGML_TENSOR_BINARY_OP_LOCALS; const int64_t row_diff = row_high - row_low; @@ -15093,6 +14967,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten return false; } + if (a->type == GGML_TYPE_IQ1_S) { + return false; + } + if (a->type == GGML_TYPE_IQ3_XXS) { + return false; + } if (a->type == GGML_TYPE_IQ2_XXS) { return false; } From c0a8c6db371cb3e4379900867b948879f5842201 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 20 Feb 2024 08:48:19 +0100 Subject: [PATCH 11/17] server : health endpoint configurable failure on no slot (#5594) --- examples/server/README.md | 9 ++++--- examples/server/server.cpp | 52 +++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 809e2d37c..f6b9c7402 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -134,10 +134,11 @@ node index.js ## API Endpoints - **GET** `/health`: Returns the current state of the server: - - `{"status": "loading model"}` if the model is still being loaded. - - `{"status": "error"}` if the model failed to load. - - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. - - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available + - 503 -> `{"status": "loading model"}` if the model is still being loaded. 
+ - 500 -> `{"status": "error"}` if the model failed to load. + - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. + - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. + - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 22c344dd4..23482ed95 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2582,40 +2582,40 @@ int main(int argc, char **argv) res.set_header("Access-Control-Allow-Headers", "*"); }); - svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { + svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) { server_state current_state = state.load(); switch(current_state) { - case SERVER_STATE_READY: - if (llama.all_slots_are_idle) { - res.set_content(R"({"status": "ok"})", "application/json"); + case SERVER_STATE_READY: { + int available_slots = 0; + int processing_slots = 0; + for (llama_client_slot &slot: llama.slots) { + if (slot.available()) { + available_slots++; + } else { + processing_slots++; + } + } + if (available_slots > 0) { + json health = { + {"status", "ok"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); res.status = 200; // HTTP OK } else { - int available_slots = 0; - int processing_slots = 0; - for (llama_client_slot & slot : llama.slots) { - if (slot.available()) { - available_slots++; - } else { - processing_slots++; - } - } - if (available_slots > 0) { - json health = { - {"status", "ok"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); - res.status = 200; // HTTP OK - } else { - json health = { - {"status", "no slot available"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); + json health = { + {"status", "no slot available"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + if (req.has_param("fail_on_no_slot")) { res.status = 503; // HTTP Service Unavailable + } else { + res.status = 200; // HTTP OK } } break; + } case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json"); res.status = 503; // HTTP Service Unavailable From 8dbbd75754d43ec7b4bbe42fb287cc2553fdf0e9 Mon Sep 17 00:00:00 2001 From: Haoxiang Fei Date: Mon, 19 Feb 2024 22:58:36 -1100 Subject: [PATCH 12/17] metal : add build system support for embedded metal library (#5604) * add build support for embedded metal library * Update Makefile --------- Co-authored-by: Haoxiang Fei Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 24 ++++++++++++++++++++++++ Makefile | 18 ++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 168b133f4..3c4629001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,7 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG 
"llama: disable Metal debugging" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) +option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -201,6 +202,29 @@ if (LLAMA_METAL) # copy ggml-metal.metal to bin directory configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + if (LLAMA_METAL_EMBED_LIBRARY) + enable_language(ASM) + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") + + add_custom_command( + OUTPUT ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY} + DEPENDS ${METALLIB_SOURCE} + COMMENT "Generate assembly for embedded Metal library" + ) + + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY}) + endif() + if (LLAMA_METAL_SHADER_DEBUG) # custom command to do the following: # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air diff --git a/Makefile b/Makefile index db5df1b32..211a08d7f 100644 --- a/Makefile +++ b/Makefile @@ -533,11 +533,29 @@ ifdef LLAMA_METAL ifdef LLAMA_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif +ifdef LLAMA_METAL_EMBED_LIBRARY + MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY + OBJS += ggml-metal-embed.o +endif endif # LLAMA_METAL ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ + +ifdef LLAMA_METAL_EMBED_LIBRARY +ggml-metal-embed.o: ggml-metal.metal + @echo "Embedding Metal library" + $(eval TEMP_ASSEMBLY=$(shell mktemp)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @$(AS) $(TEMP_ASSEMBLY) -o $@ + @rm -f ${TEMP_ASSEMBLY} +endif endif # LLAMA_METAL ifdef LLAMA_MPI From 5207b3fbc500f89dfe528693e96540956dbaed96 Mon Sep 17 00:00:00 2001 From: Dane Madsen Date: Tue, 20 Feb 2024 21:00:23 +1100 Subject: [PATCH 13/17] readme : update UI list (#5605) * Add maid to ui list * Specify licence --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 70866e249..747d2e98b 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [pythops/tenere](https://github.com/pythops/tenere) (AGPL) - [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) --- From 9c405c9f9a7cfd23511fd6b2de05dc72481119b4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 20 Feb 2024 15:58:27 +0100 Subject: 
[PATCH 14/17] Server: use llama_chat_apply_template (#5593) * server: use llama_chat_apply_template * server: remove trailing space * server: fix format_chat * server: fix help message Co-authored-by: Georgi Gerganov * server: fix formatted_chat --------- Co-authored-by: Georgi Gerganov --- examples/server/oai.hpp | 6 ++-- examples/server/server.cpp | 17 +++++----- examples/server/utils.hpp | 69 ++++++++++++++++++-------------------- llama.cpp | 2 +- 4 files changed, 45 insertions(+), 49 deletions(-) diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp index 2eca8a9fb..ff4ad6994 100644 --- a/examples/server/oai.hpp +++ b/examples/server/oai.hpp @@ -15,13 +15,11 @@ using json = nlohmann::json; inline static json oaicompat_completion_params_parse( + const struct llama_model * model, const json &body, /* openai api json semantics */ const std::string &chat_template) { json llama_params; - std::string formatted_prompt = chat_template == "chatml" - ? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...) - : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...) llama_params["__oaicompat"] = true; @@ -34,7 +32,7 @@ inline static json oaicompat_completion_params_parse( // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = formatted_prompt; + llama_params["prompt"] = format_chat(model, chat_template, body["messages"]); llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 23482ed95..c7821eca6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -37,7 +37,7 @@ struct server_params std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; - std::string chat_template = "chatml"; + std::string chat_template = ""; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -1937,8 +1937,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); - printf(" --chat-template FORMAT_NAME"); - printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str()); + printf(" --chat-template JINJA_TEMPLATE\n"); + printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); + printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n"); printf("\n"); } @@ -2389,13 +2390,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - std::string value(argv[i]); - if (value != "chatml" && value != "llama2") { - fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str()); + if (!verify_custom_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); invalid_param = true; break; } - sparams.chat_template = value; + sparams.chat_template = argv[i]; } else if (arg == "--override-kv") { @@ -2913,7 +2914,7 @@ int main(int argc, char **argv) if (!validate_api_key(req, res)) { return; } - json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template); + json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template); const int task_id = llama.queue_tasks.get_new_id(); llama.queue_results.add_waiting_task_id(task_id); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0ee670dba..e954fb0ef 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -167,50 +167,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v : default_value; } -inline std::string format_llama2(std::vector messages) -{ - std::ostringstream output; - bool is_inside_turn = false; - - for (auto it = messages.begin(); it != messages.end(); ++it) { - if (!is_inside_turn) { - output << "[INST] "; - } - std::string role = json_value(*it, "role", std::string("user")); - std::string content = json_value(*it, "content", std::string("")); - if (role == "system") { - output << "<>\n" << content << "\n<>\n\n"; - is_inside_turn = true; - } else if (role == "user") { - output << content << " [/INST]"; - is_inside_turn = true; - } else { - output << " " << content << " "; - is_inside_turn = false; - } - } - - LOG_VERBOSE("format_llama2", {{"text", output.str()}}); - - return output.str(); +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid +inline bool verify_custom_template(const std::string & tmpl) { + llama_chat_message chat[] = {{"user", "test"}}; + std::vector buf(1); + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size()); + return res >= 0; } -inline std::string format_chatml(std::vector messages) +// Format given chat. 
If tmpl is empty, we take the template from model metadata +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - std::ostringstream chatml_msgs; + size_t alloc_size = 0; + // vector holding all allocated string to be passed to llama_chat_apply_template + std::vector str(messages.size() * 2); + std::vector chat(messages.size()); - for (auto it = messages.begin(); it != messages.end(); ++it) { - chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; - chatml_msgs << json_value(*it, "content", std::string("")) - << "<|im_end|>\n"; + for (size_t i = 0; i < messages.size(); ++i) { + auto &curr_msg = messages[i]; + str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); + str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); + alloc_size += str[i*2 + 1].length(); + chat[i].role = str[i*2 + 0].c_str(); + chat[i].content = str[i*2 + 1].c_str(); } - chatml_msgs << "<|im_start|>assistant" << '\n'; + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + std::vector buf(alloc_size * 2); - LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}}); + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - return chatml_msgs.str(); + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); + } + + std::string formatted_chat(buf.data(), res); + LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); + + return formatted_chat; } // diff --git a/llama.cpp b/llama.cpp index 5de07dfa9..4296eca32 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12602,7 +12602,7 @@ LLAMA_API int32_t llama_chat_apply_template( // load template from model std::vector model_template(2048, 0); // longest known template is about 1200 bytes std::string template_key = "tokenizer.chat_template"; - int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size()); + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); if (res < 0) { // worst case: there is no information about template, we will use chatml by default curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal From 4ed8e4fbef6a15afd993bfcd9ffa279841e18ef1 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 20 Feb 2024 18:30:27 +0100 Subject: [PATCH 15/17] llava : add explicit instructions for llava-1.6 (#5611) This commit contains a suggestion for the README.md in the llava example. The suggestion adds explicit instructions for how to convert a llava-1.6 model and run it using llava-cli. The motivation for this is that having explicit instructions similar to the 1.5 instructions will make it easier for users to try this out. Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index e42db6e5a..25ea96715 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -59,14 +59,40 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. 
## LLaVA 1.6 gguf conversion - -1) Backup your pth/safetensor model files as llava-surgery modifies them -2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models: +1) First clone a LLaVA 1.6 model: +```console +git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b +``` +2) Backup your pth/safetensor model files as llava-surgery modifies them +3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +```console +python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ +``` - you will find a llava.projector and a llava.clip file in your model directory -3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json. -4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision` +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +```console +mkdir vit +cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin +cp ../llava-v1.6-vicuna-7b/llava.projector vit/ +curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json +``` + +5) Create the visual gguf model: +```console +python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -5) Everything else as usual: convert.py the hf model, quantize as needed + +6) Then convert the model to gguf format: +```console +python ./convert.py ../llava-v1.6-vicuna-7b/ +``` + +7) And finally we can run the llava-cli using the 1.6 model version: +```console +./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 +``` + **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) **note** llava-1.6 greatly benefits from batched prompt processing (defaults work) From 06bf2cf8c406e6b70dbf9b431a02fa0ad845b9df Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 20 Feb 2024 20:06:17 +0100 Subject: [PATCH 16/17] make : fix debug build with CUDA (#5616) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 211a08d7f..41c79c135 100644 --- a/Makefile +++ b/Makefile @@ -173,7 +173,7 @@ ifdef LLAMA_DEBUG MK_LDFLAGS += -g ifeq ($(UNAME_S),Linux) - MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS + MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS endif else MK_CPPFLAGS += -DNDEBUG From 6560bed3f066c876682464762cad90f1e28e3f1b Mon Sep 17 00:00:00 2001 From: CJ Pais Date: Tue, 20 Feb 2024 11:07:22 -0800 Subject: [PATCH 17/17] server : support llava 1.6 (#5553) * server: init working 1.6 * move clip_image to header * remove commented code * remove c++ style from header * remove todo * expose llava_image_embed_make_with_clip_img * fix zig build --- Makefile | 2 +- build.zig | 3 ++- examples/llava/llava.cpp | 2 +- examples/llava/llava.h | 2 ++ examples/server/server.cpp | 36 +++--------------------------------- 
5 files changed, 9 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index 41c79c135..f03faf6ed 100644 --- a/Makefile +++ b/Makefile @@ -719,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2) diff --git a/build.zig b/build.zig index 699738f3d..c0af454dc 100644 --- a/build.zig +++ b/build.zig @@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void { const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); const clip = make.obj("clip", "examples/llava/clip.cpp"); + const llava = make.obj("llava", "examples/llava/llava.cpp"); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); @@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void { _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4cb65a07b..1a1cf7c78 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * return true; } -static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { +bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, 
const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 9e9466a5d..2d40f3f1d 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -31,6 +31,8 @@ struct llava_image_embed { /** sanity check for clip <-> llava embed size match */ LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); + /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c7821eca6..eb01729fa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,7 @@ #include "oai.hpp" #include "../llava/clip.h" +#include "../llava/llava.h" #include "stb_image.h" @@ -997,43 +998,12 @@ struct llama_server_context { continue; } - clip_image_f32_batch img_res_v; - img_res_v.size = 0; - img_res_v.data = nullptr; - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v)) - { - LOG_TEE("Error processing the given image"); - clip_free(clp_ctx); - clip_image_f32_batch_free(img_res_v); - return false; - } - if (img_res_v.size == 0) - { + + if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { LOG_TEE("Error processing the given image"); return false; } - // note: assumes only one image was returned by clip_image_preprocess - clip_image_f32 * img_res = img_res_v.data; - - img.image_tokens = clip_n_patches(clp_ctx); - img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); - if (!img.image_embedding) - { - LOG_TEE("Unable to allocate memory for image embeddings\n"); - clip_image_f32_batch_free(img_res_v); - clip_free(clp_ctx); - return false; - } - LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); - if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) - { - LOG_TEE("Unable to encode image\n"); - clip_image_f32_batch_free(img_res_v); - return false; - } - - clip_image_f32_batch_free(img_res_v); img.request_encode_image = false; }
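
A minimal way to exercise the reworked `/health` endpoint from patch 11, for illustration only: it assumes a server built from this series and left on its default bind address `127.0.0.1:8080`, and the model path below is a placeholder.

```console
# start the server (placeholder model path)
./server -m models/model.gguf

# returns 200 with slot counts while a slot is idle; returns 503 only when
# no slot is free AND the fail_on_no_slot query parameter is present
# (the server checks only for the parameter's presence, its value is ignored)
curl 'http://127.0.0.1:8080/health?fail_on_no_slot=1'
```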