From f24ed14ee0ce28dfe98115c378b37da144912016 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 19 Feb 2024 15:54:12 -0500 Subject: [PATCH 01/17] make : pass CPPFLAGS directly to nvcc, not via -Xcompiler (#5598) --- Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 63b4af9ba..db5df1b32 100644 --- a/Makefile +++ b/Makefile @@ -446,9 +446,9 @@ ifdef LLAMA_CUDA_CCBIN endif ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ifdef JETSON_EOL_MODULE_DETECT - $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ + $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ else - $(NVCC) $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endif # JETSON_EOL_MODULE_DETECT endif # LLAMA_CUBLAS @@ -549,9 +549,10 @@ GF_CC := $(CC) include scripts/get-flags.mk # combine build flags with cmdline overrides -override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) -BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) -override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) +override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) +override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) +BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS) +override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS) override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) From 40c3a6c1e11040088b4a1ce0abc4651cb3011dd4 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 19 Feb 2024 23:40:26 +0100 Subject: [PATCH 02/17] cuda : ignore peer access already enabled errors (#5597) * cuda : ignore peer access already enabled errors * fix hip --- ggml-cuda.cu | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e091dbdc1..6caae56b0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -54,6 +54,8 @@ #define cudaDeviceProp hipDeviceProp_t #define cudaDeviceSynchronize hipDeviceSynchronize #define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled #define cudaEventCreateWithFlags hipEventCreateWithFlags #define cudaEventDisableTiming hipEventDisableTiming #define cudaEventRecord hipEventRecord @@ -9325,9 +9327,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); if (can_access_peer) { if (enable_peer_access) { - CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0); + if (err != cudaErrorPeerAccessAlreadyEnabled) { + CUDA_CHECK(err); + } } else { - CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + cudaError_t err = cudaDeviceDisablePeerAccess(id_other); + if (err != cudaErrorPeerAccessNotEnabled) { + CUDA_CHECK(err); + } } } } @@ -10999,10 +11007,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe UNUSED(buffer); } -// unused at the moment -//static bool 
ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { -// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; -//} +static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; + UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds +} GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; @@ -11390,7 +11398,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->extra != nullptr); } } From 5dde5408978eda22242b87e22e306d1c2d1a5834 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Sat, 3 Feb 2024 17:56:46 +0000 Subject: [PATCH 03/17] Allow for Vulkan build with Accelerate. Closes #5304 --- ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.c b/ggml.c index 4ee2c5e11..d129df505 100644 --- a/ggml.c +++ b/ggml.c @@ -273,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) { #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" +#elif defined(GGML_USE_VULKAN) +#include "ggml-vulkan.h" #endif #elif defined(GGML_USE_OPENBLAS) #if defined(GGML_BLAS_USE_MKL) From 42f664a3825dfde13a32c3577ab66d10c56f3aa6 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Sat, 3 Feb 2024 18:00:11 +0000 Subject: [PATCH 04/17] Resolve ErrorIncompatibleDriver with Vulkan on MacOS. Refs: - https://chat.openai.com/share/7020ce72-65fc-45ec-b7be-9d9d798a5f3f - https://github.com/SaschaWillems/Vulkan/issues/954 - https://github.com/haasn/libplacebo/issues/128 - https://github.com/KhronosGroup/Vulkan-Samples/issues/476 --- ggml-vulkan.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4a30414df..e9e966dbf 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1109,8 +1109,10 @@ static void ggml_vk_instance_init() { #ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", #endif + "VK_KHR_portability_enumeration", }; - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); + + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR), &app_info, layers, extensions); #ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; vk::ValidationFeaturesEXT validation_features = { From d8c054517dc24f1316f3be12a98fff383e1e93e3 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Tue, 6 Feb 2024 14:39:22 +0000 Subject: [PATCH 05/17] Add preprocessor checks for Apple devices. 
Based on work by @rbourgeat in https://github.com/ggerganov/llama.cpp/pull/5322/files --- ggml-vulkan.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index e9e966dbf..33b8a9061 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1109,10 +1109,15 @@ static void ggml_vk_instance_init() { #ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", #endif +#ifdef __APPLE__ "VK_KHR_portability_enumeration", +#endif }; + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); +#ifdef __APPLE__ + instance_create_info.flags = vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; +#endif - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR), &app_info, layers, extensions); #ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; vk::ValidationFeaturesEXT validation_features = { From f50db6ae0bdcb5f8593ca6ca46dfa03b177faa2f Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 10 Feb 2024 22:14:52 +0100 Subject: [PATCH 06/17] Add check for VK_KHR_portability_enumeration for MoltenVK support --- ggml-vulkan.cpp | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 33b8a9061..37123ac8f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1100,23 +1100,44 @@ static void ggml_vk_instance_init() { #endif vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; - const std::vector layers = { + + const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); +#ifdef __APPLE__ + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + portability_enumeration_ext = true; + break; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." 
<< std::endl; + } +#endif + + std::vector layers = { #ifdef GGML_VULKAN_VALIDATE "VK_LAYER_KHRONOS_validation", #endif }; - const std::vector extensions = { + std::vector extensions = { #ifdef GGML_VULKAN_VALIDATE "VK_EXT_validation_features", -#endif -#ifdef __APPLE__ - "VK_KHR_portability_enumeration", #endif }; - vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions); #ifdef __APPLE__ - instance_create_info.flags = vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; + if (portability_enumeration_ext) { + extensions.push_back("VK_KHR_portability_enumeration"); + } #endif + vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); +#ifdef __APPLE__ + if (portability_enumeration_ext) { + instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; + } +#endif + #ifdef GGML_VULKAN_VALIDATE const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; @@ -1175,12 +1196,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { vk_instance.devices[idx] = std::make_shared(); ctx->device = vk_instance.devices[idx]; ctx->device.lock()->physical_device = devices[dev_num]; - std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); + const std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; // Check if maintenance4 is supported - for (auto properties : ext_props) { + for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; } @@ -1211,7 +1232,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { bool fp16_storage = false; bool fp16_compute = false; - for (auto properties : ext_props) { + for (const auto& properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { From bb9dcd560a7e81265398b0d463c40f3e467daf19 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 14 Feb 2024 20:57:17 +0100 Subject: [PATCH 07/17] Refactor validation and enumeration platform checks into functions to clean up ggml_vk_instance_init() --- ggml-vulkan.cpp | 101 ++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 37123ac8f..4e5eaff15 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) { } } -static void ggml_vk_instance_init() { +static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions); +static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions); + +void ggml_vk_instance_init() { if (vk_instance_initialized) { return; } @@ -1102,54 +1105,40 @@ static void ggml_vk_instance_init() { vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; const std::vector instance_extensions = vk::enumerateInstanceExtensionProperties(); -#ifdef __APPLE__ - bool portability_enumeration_ext = false; - // Check for portability enumeration extension for MoltenVK support - for (const auto& properties : instance_extensions) { - if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { - portability_enumeration_ext = true; - break; - } - } - if 
(!portability_enumeration_ext) { - std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; - } -#endif + const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions); + const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions); - std::vector layers = { -#ifdef GGML_VULKAN_VALIDATE - "VK_LAYER_KHRONOS_validation", -#endif - }; - std::vector extensions = { -#ifdef GGML_VULKAN_VALIDATE - "VK_EXT_validation_features", -#endif - }; -#ifdef __APPLE__ + std::vector layers; + + if (validation_ext) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + std::vector extensions; + if (validation_ext) { + extensions.push_back("VK_EXT_validation_features"); + } if (portability_enumeration_ext) { extensions.push_back("VK_KHR_portability_enumeration"); } -#endif vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions); -#ifdef __APPLE__ if (portability_enumeration_ext) { instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR; } -#endif + std::vector features_enable; + vk::ValidationFeaturesEXT validation_features; -#ifdef GGML_VULKAN_VALIDATE - const std::vector features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; - vk::ValidationFeaturesEXT validation_features = { - features_enable, - {}, - }; - validation_features.setPNext(nullptr); - instance_create_info.setPNext(&validation_features); + if (validation_ext) { + features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices }; + validation_features = { + features_enable, + {}, + }; + validation_features.setPNext(nullptr); + instance_create_info.setPNext(&validation_features); - std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; -#endif + std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; + } vk_instance.instance = vk::createInstance(instance_create_info); memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES); @@ -5329,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() { return vk_instance.device_indices.size(); } +// Extension availability +static bool ggml_vk_instance_validation_ext_available(const std::vector& instance_extensions) { +#ifdef GGML_VULKAN_VALIDATE + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl; + } +#endif + return false; + + UNUSED(instance_extensions); +} +static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector& instance_extensions) { +#ifdef __APPLE__ + bool portability_enumeration_ext = false; + // Check for portability enumeration extension for MoltenVK support + for (const auto& properties : instance_extensions) { + if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) { + return true; + } + } + if (!portability_enumeration_ext) { + std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." 
<< std::endl; + } +#endif + return false; + + UNUSED(instance_extensions); +} + // checks #ifdef GGML_VULKAN_CHECK_RESULTS From 22f83f0c383e12106692b8afc224d61b8993a52c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 10 Feb 2024 22:18:33 +0100 Subject: [PATCH 08/17] Enable Vulkan MacOS CI --- .devops/nix/package.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index ad23f7dd7..815db6a2d 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -255,11 +255,11 @@ effectiveStdenv.mkDerivation ( # Configurations we don't want even the CI to evaluate. Results in the # "unsupported platform" messages. This is mostly a no-op, because # cudaPackages would've refused to evaluate anyway. - badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin; + badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin; # Configurations that are known to result in build failures. Can be # overridden by importing Nixpkgs with `allowBroken = true`. - broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin); + broken = (useMetalKit && !effectiveStdenv.isDarwin); description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; homepage = "https://github.com/ggerganov/llama.cpp/"; From 633782b8d949f24b619e6c68ee37b5cc79167173 Mon Sep 17 00:00:00 2001 From: Mathijs de Bruin Date: Tue, 13 Feb 2024 20:28:02 +0000 Subject: [PATCH 09/17] nix: now that we can do so, allow MacOS to build Vulkan binaries Author: Philip Taron Date: Tue Feb 13 20:28:02 2024 +0000 --- flake.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index ad2f9b295..dc4e503c3 100644 --- a/flake.nix +++ b/flake.nix @@ -150,6 +150,7 @@ packages = { default = config.legacyPackages.llamaPackages.llama-cpp; + vulkan = config.packages.default.override { useVulkan = true; }; } // lib.optionalAttrs pkgs.stdenv.isLinux { opencl = config.packages.default.override { useOpenCL = true; }; @@ -157,7 +158,6 @@ mpi-cpu = config.packages.default.override { useMpi = true; }; mpi-cuda = config.packages.default.override { useMpi = true; }; - vulkan = config.packages.default.override { useVulkan = true; }; } // lib.optionalAttrs (system == "x86_64-linux") { rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp; From b9111bd209c7b11b0592450a6ed2e0ca545b2c84 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Tue, 20 Feb 2024 07:01:25 +0000 Subject: [PATCH 10/17] Update ggml_sycl_op_mul_mat_vec_q (#5502) * Update ggml_sycl_op_mul_mat_vec_q * Apply suggestions from code review Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> * revert suggestion on macro * fix bug * Add quant type GGML_TYPE_IQ1_S to unsupported * fix format --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> --- ggml-sycl.cpp | 258 ++++++++++++++------------------------------------ 1 file changed, 69 insertions(+), 189 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cd4b3a1e1..df1826112 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, } } -static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y 
- 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK4_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK5_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK5_1 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int 
block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); -} - -static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1); - }); +template +static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), [= + ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); } int get_device_index_by_id(int id){ @@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q( const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + // TODO: support these quantization types + GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS || + src0->type == GGML_TYPE_IQ2_XS || + src0->type == GGML_TYPE_IQ3_XXS || + src0->type == GGML_TYPE_IQ1_S)); + switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + 
mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); - break; + mul_mat_vec_q_sycl_submitter( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream) { - GGML_TENSOR_BINARY_OP_LOCALS + GGML_TENSOR_BINARY_OP_LOCALS; const int64_t row_diff = row_high - row_low; @@ -15093,6 +14967,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten return false; } + if (a->type == GGML_TYPE_IQ1_S) { + return false; + } + if (a->type == GGML_TYPE_IQ3_XXS) { + return false; + } if (a->type == GGML_TYPE_IQ2_XXS) { return false; } From c0a8c6db371cb3e4379900867b948879f5842201 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 20 Feb 2024 08:48:19 +0100 Subject: [PATCH 11/17] server : health endpoint configurable failure on no slot (#5594) --- examples/server/README.md | 9 ++++--- examples/server/server.cpp | 52 +++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 809e2d37c..f6b9c7402 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -134,10 +134,11 @@ node index.js ## API Endpoints - **GET** `/health`: Returns the current state of the server: - - `{"status": "loading model"}` if the model is still being loaded. - - `{"status": "error"}` if the model failed to load. - - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. - - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available + - 503 -> `{"status": "loading model"}` if the model is still being loaded. 
+ - 500 -> `{"status": "error"}` if the model failed to load. + - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. + - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. + - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 22c344dd4..23482ed95 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2582,40 +2582,40 @@ int main(int argc, char **argv) res.set_header("Access-Control-Allow-Headers", "*"); }); - svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { + svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) { server_state current_state = state.load(); switch(current_state) { - case SERVER_STATE_READY: - if (llama.all_slots_are_idle) { - res.set_content(R"({"status": "ok"})", "application/json"); + case SERVER_STATE_READY: { + int available_slots = 0; + int processing_slots = 0; + for (llama_client_slot &slot: llama.slots) { + if (slot.available()) { + available_slots++; + } else { + processing_slots++; + } + } + if (available_slots > 0) { + json health = { + {"status", "ok"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); res.status = 200; // HTTP OK } else { - int available_slots = 0; - int processing_slots = 0; - for (llama_client_slot & slot : llama.slots) { - if (slot.available()) { - available_slots++; - } else { - processing_slots++; - } - } - if (available_slots > 0) { - json health = { - {"status", "ok"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); - res.status = 200; // HTTP OK - } else { - json health = { - {"status", "no slot available"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); + json health = { + {"status", "no slot available"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + if (req.has_param("fail_on_no_slot")) { res.status = 503; // HTTP Service Unavailable + } else { + res.status = 200; // HTTP OK } } break; + } case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json"); res.status = 503; // HTTP Service Unavailable From 8dbbd75754d43ec7b4bbe42fb287cc2553fdf0e9 Mon Sep 17 00:00:00 2001 From: Haoxiang Fei Date: Mon, 19 Feb 2024 22:58:36 -1100 Subject: [PATCH 12/17] metal : add build system support for embedded metal library (#5604) * add build support for embedded metal library * Update Makefile --------- Co-authored-by: Haoxiang Fei Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 24 ++++++++++++++++++++++++ Makefile | 18 ++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 168b133f4..3c4629001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,7 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG 
"llama: disable Metal debugging" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) +option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -201,6 +202,29 @@ if (LLAMA_METAL) # copy ggml-metal.metal to bin directory configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) + if (LLAMA_METAL_EMBED_LIBRARY) + enable_language(ASM) + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal") + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") + + add_custom_command( + OUTPUT ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY} + COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY} + DEPENDS ${METALLIB_SOURCE} + COMMENT "Generate assembly for embedded Metal library" + ) + + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY}) + endif() + if (LLAMA_METAL_SHADER_DEBUG) # custom command to do the following: # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air diff --git a/Makefile b/Makefile index db5df1b32..211a08d7f 100644 --- a/Makefile +++ b/Makefile @@ -533,11 +533,29 @@ ifdef LLAMA_METAL ifdef LLAMA_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif +ifdef LLAMA_METAL_EMBED_LIBRARY + MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY + OBJS += ggml-metal-embed.o +endif endif # LLAMA_METAL ifdef LLAMA_METAL ggml-metal.o: ggml-metal.m ggml-metal.h $(CC) $(CFLAGS) -c $< -o $@ + +ifdef LLAMA_METAL_EMBED_LIBRARY +ggml-metal-embed.o: ggml-metal.metal + @echo "Embedding Metal library" + $(eval TEMP_ASSEMBLY=$(shell mktemp)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @$(AS) $(TEMP_ASSEMBLY) -o $@ + @rm -f ${TEMP_ASSEMBLY} +endif endif # LLAMA_METAL ifdef LLAMA_MPI From 5207b3fbc500f89dfe528693e96540956dbaed96 Mon Sep 17 00:00:00 2001 From: Dane Madsen Date: Tue, 20 Feb 2024 21:00:23 +1100 Subject: [PATCH 13/17] readme : update UI list (#5605) * Add maid to ui list * Specify licence --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 70866e249..747d2e98b 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [pythops/tenere](https://github.com/pythops/tenere) (AGPL) - [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) --- From 9c405c9f9a7cfd23511fd6b2de05dc72481119b4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 20 Feb 2024 15:58:27 +0100 Subject: 
[PATCH 14/17] Server: use llama_chat_apply_template (#5593) * server: use llama_chat_apply_template * server: remove trailing space * server: fix format_chat * server: fix help message Co-authored-by: Georgi Gerganov * server: fix formatted_chat --------- Co-authored-by: Georgi Gerganov --- examples/server/oai.hpp | 6 ++-- examples/server/server.cpp | 17 +++++----- examples/server/utils.hpp | 69 ++++++++++++++++++-------------------- llama.cpp | 2 +- 4 files changed, 45 insertions(+), 49 deletions(-) diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp index 2eca8a9fb..ff4ad6994 100644 --- a/examples/server/oai.hpp +++ b/examples/server/oai.hpp @@ -15,13 +15,11 @@ using json = nlohmann::json; inline static json oaicompat_completion_params_parse( + const struct llama_model * model, const json &body, /* openai api json semantics */ const std::string &chat_template) { json llama_params; - std::string formatted_prompt = chat_template == "chatml" - ? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...) - : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...) llama_params["__oaicompat"] = true; @@ -34,7 +32,7 @@ inline static json oaicompat_completion_params_parse( // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = formatted_prompt; + llama_params["prompt"] = format_chat(model, chat_template, body["messages"]); llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 23482ed95..c7821eca6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -37,7 +37,7 @@ struct server_params std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; - std::string chat_template = "chatml"; + std::string chat_template = ""; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -1937,8 +1937,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); - printf(" --chat-template FORMAT_NAME"); - printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str()); + printf(" --chat-template JINJA_TEMPLATE\n"); + printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); + printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n"); printf("\n"); } @@ -2389,13 +2390,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - std::string value(argv[i]); - if (value != "chatml" && value != "llama2") { - fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str()); + if (!verify_custom_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); invalid_param = true; break; } - sparams.chat_template = value; + sparams.chat_template = argv[i]; } else if (arg == "--override-kv") { @@ -2913,7 +2914,7 @@ int main(int argc, char **argv) if (!validate_api_key(req, res)) { return; } - json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template); + json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template); const int task_id = llama.queue_tasks.get_new_id(); llama.queue_results.add_waiting_task_id(task_id); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 0ee670dba..e954fb0ef 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -167,50 +167,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v : default_value; } -inline std::string format_llama2(std::vector messages) -{ - std::ostringstream output; - bool is_inside_turn = false; - - for (auto it = messages.begin(); it != messages.end(); ++it) { - if (!is_inside_turn) { - output << "[INST] "; - } - std::string role = json_value(*it, "role", std::string("user")); - std::string content = json_value(*it, "content", std::string("")); - if (role == "system") { - output << "<>\n" << content << "\n<>\n\n"; - is_inside_turn = true; - } else if (role == "user") { - output << content << " [/INST]"; - is_inside_turn = true; - } else { - output << " " << content << " "; - is_inside_turn = false; - } - } - - LOG_VERBOSE("format_llama2", {{"text", output.str()}}); - - return output.str(); +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid +inline bool verify_custom_template(const std::string & tmpl) { + llama_chat_message chat[] = {{"user", "test"}}; + std::vector buf(1); + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size()); + return res >= 0; } -inline std::string format_chatml(std::vector messages) +// Format given chat. 
If tmpl is empty, we take the template from model metadata +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - std::ostringstream chatml_msgs; + size_t alloc_size = 0; + // vector holding all allocated string to be passed to llama_chat_apply_template + std::vector str(messages.size() * 2); + std::vector chat(messages.size()); - for (auto it = messages.begin(); it != messages.end(); ++it) { - chatml_msgs << "<|im_start|>" - << json_value(*it, "role", std::string("user")) << '\n'; - chatml_msgs << json_value(*it, "content", std::string("")) - << "<|im_end|>\n"; + for (size_t i = 0; i < messages.size(); ++i) { + auto &curr_msg = messages[i]; + str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); + str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); + alloc_size += str[i*2 + 1].length(); + chat[i].role = str[i*2 + 0].c_str(); + chat[i].content = str[i*2 + 1].c_str(); } - chatml_msgs << "<|im_start|>assistant" << '\n'; + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + std::vector buf(alloc_size * 2); - LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}}); + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - return chatml_msgs.str(); + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); + } + + std::string formatted_chat(buf.data(), res); + LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); + + return formatted_chat; } // diff --git a/llama.cpp b/llama.cpp index 5de07dfa9..4296eca32 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12602,7 +12602,7 @@ LLAMA_API int32_t llama_chat_apply_template( // load template from model std::vector model_template(2048, 0); // longest known template is about 1200 bytes std::string template_key = "tokenizer.chat_template"; - int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size()); + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); if (res < 0) { // worst case: there is no information about template, we will use chatml by default curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal From 4ed8e4fbef6a15afd993bfcd9ffa279841e18ef1 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 20 Feb 2024 18:30:27 +0100 Subject: [PATCH 15/17] llava : add explicit instructions for llava-1.6 (#5611) This commit contains a suggestion for the README.md in the llava example. The suggestion adds explicit instructions for how to convert a llava-1.6 model and run it using llava-cli. The motivation for this is that having explicit instructions similar to the 1.5 instructions will make it easier for users to try this out. Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index e42db6e5a..25ea96715 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -59,14 +59,40 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. 
## LLaVA 1.6 gguf conversion - -1) Backup your pth/safetensor model files as llava-surgery modifies them -2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models: +1) First clone a LLaVA 1.6 model: +```console +git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b +``` +2) Backup your pth/safetensor model files as llava-surgery modifies them +3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +```console +python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ +``` - you will find a llava.projector and a llava.clip file in your model directory -3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json. -4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision` +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +```console +mkdir vit +cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin +cp ../llava-v1.6-vicuna-7b/llava.projector vit/ +curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json +``` + +5) Create the visual gguf model: +```console +python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP -5) Everything else as usual: convert.py the hf model, quantize as needed + +6) Then convert the model to gguf format: +```console +python ./convert.py ../llava-v1.6-vicuna-7b/ +``` + +7) And finally we can run the llava-cli using the 1.6 model version: +```console +./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 +``` + **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) **note** llava-1.6 greatly benefits from batched prompt processing (defaults work) From 06bf2cf8c406e6b70dbf9b431a02fa0ad845b9df Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 20 Feb 2024 20:06:17 +0100 Subject: [PATCH 16/17] make : fix debug build with CUDA (#5616) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 211a08d7f..41c79c135 100644 --- a/Makefile +++ b/Makefile @@ -173,7 +173,7 @@ ifdef LLAMA_DEBUG MK_LDFLAGS += -g ifeq ($(UNAME_S),Linux) - MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS + MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS endif else MK_CPPFLAGS += -DNDEBUG From 6560bed3f066c876682464762cad90f1e28e3f1b Mon Sep 17 00:00:00 2001 From: CJ Pais Date: Tue, 20 Feb 2024 11:07:22 -0800 Subject: [PATCH 17/17] server : support llava 1.6 (#5553) * server: init working 1.6 * move clip_image to header * remove commented code * remove c++ style from header * remove todo * expose llava_image_embed_make_with_clip_img * fix zig build --- Makefile | 2 +- build.zig | 3 ++- examples/llava/llava.cpp | 2 +- examples/llava/llava.h | 2 ++ examples/server/server.cpp | 36 +++--------------------------------- 
5 files changed, 9 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index 41c79c135..f03faf6ed 100644 --- a/Makefile +++ b/Makefile @@ -719,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2) diff --git a/build.zig b/build.zig index 699738f3d..c0af454dc 100644 --- a/build.zig +++ b/build.zig @@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void { const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); const clip = make.obj("clip", "examples/llava/clip.cpp"); + const llava = make.obj("llava", "examples/llava/llava.cpp"); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); @@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void { _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4cb65a07b..1a1cf7c78 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * return true; } -static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { +bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, 
const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { fprintf(stderr, "Unable to allocate memory for image embeddings\n"); diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 9e9466a5d..2d40f3f1d 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -31,6 +31,8 @@ struct llava_image_embed { /** sanity check for clip <-> llava embed size match */ LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); + /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c7821eca6..eb01729fa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,7 @@ #include "oai.hpp" #include "../llava/clip.h" +#include "../llava/llava.h" #include "stb_image.h" @@ -997,43 +998,12 @@ struct llama_server_context { continue; } - clip_image_f32_batch img_res_v; - img_res_v.size = 0; - img_res_v.data = nullptr; - if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v)) - { - LOG_TEE("Error processing the given image"); - clip_free(clp_ctx); - clip_image_f32_batch_free(img_res_v); - return false; - } - if (img_res_v.size == 0) - { + + if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { LOG_TEE("Error processing the given image"); return false; } - // note: assumes only one image was returned by clip_image_preprocess - clip_image_f32 * img_res = img_res_v.data; - - img.image_tokens = clip_n_patches(clp_ctx); - img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); - if (!img.image_embedding) - { - LOG_TEE("Unable to allocate memory for image embeddings\n"); - clip_image_f32_batch_free(img_res_v); - clip_free(clp_ctx); - return false; - } - LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); - if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) - { - LOG_TEE("Unable to encode image\n"); - clip_image_f32_batch_free(img_res_v); - return false; - } - - clip_image_f32_batch_free(img_res_v); img.request_encode_image = false; }
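
A minimal way to exercise the reworked `/health` endpoint from patch 11, for illustration only: it assumes a server built from this series and left on its default bind address `127.0.0.1:8080`, and the model path below is a placeholder.

```console
# start the server (placeholder model path)
./server -m models/model.gguf

# returns 200 with slot counts while a slot is idle; returns 503 only when
# no slot is free AND the fail_on_no_slot query parameter is present
# (the server checks only for the parameter's presence, its value is ignored)
curl 'http://127.0.0.1:8080/health?fail_on_no_slot=1'
```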