From b685daf3867c54e42a9db484d7b92619021d4510 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 10 Dec 2024 14:23:17 -0600 Subject: [PATCH 001/380] vulkan: request round-to-even for fp16 in im2col/rope_head (#10767) Vulkan doesn't mandate a specific rounding mode, but the shader_float_controls feature allows rounding mode to be requested if the implementation supports it. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 23 +++++++++++++++---- .../ggml-vulkan/vulkan-shaders/im2col.comp | 5 ++++ .../ggml-vulkan/vulkan-shaders/rope_head.comp | 5 ++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 3 +++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index bb9572d76..a8ae58ee2 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -162,6 +162,7 @@ struct vk_device_struct { uint32_t subgroup_size; uint32_t shader_core_count; bool uma; + bool float_controls_rte_fp16; bool coopmat2; bool coopmat_support; @@ -1916,17 +1917,26 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + } else { + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + } ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + } else { + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + } ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); @@ -2007,11 +2017,13 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; + vk::PhysicalDeviceVulkan12Properties vk12_props; props2.pNext = &props3; props3.pNext = &subgroup_props; subgroup_props.pNext = &driver_props; + driver_props.pNext = &vk12_props; - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&props4; @@ -2057,6 +2069,7 @@ static vk_device ggml_vk_get_device(size_t idx) { } else { device->shader_core_count = 0; } + device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16; const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index 4d48610a3..966fedf8f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -1,6 +1,11 @@ #version 450 #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_spirv_intrinsics: enable + +#if RTE16 +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif layout (push_constant) uniform parameter { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp index ea8954226..574b51ca5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp @@ -1,6 +1,11 @@ #include "types.comp" #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_spirv_intrinsics: enable + +#if RTE16 +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index ee3520833..c48a228ae 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -461,9 +461,11 @@ void process_shaders() { string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}}); @@ -471,6 +473,7 @@ void process_shaders() { string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}})); + string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}})); string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); From 43041d2eb32623d5b6ca734313327abe96f73146 Mon Sep 17 00:00:00 2001 From: "Gilad S." <7817232+giladgd@users.noreply.github.com> Date: Wed, 11 Dec 2024 02:47:21 +0200 Subject: [PATCH 002/380] ggml: load all backends from a user-provided search path (#10699) * feat: load all backends from a user-provided search path * fix: Windows search path * refactor: rename `ggml_backend_load_all_in_search_path` to `ggml_backend_load_all_from_path` * refactor: rename `search_path` to `dir_path` * fix: change `NULL` to `nullptr` Co-authored-by: Diego Devesa * fix: change `NULL` to `nullptr` --------- Co-authored-by: Diego Devesa --- ggml/include/ggml-backend.h | 1 + ggml/src/ggml-backend-reg.cpp | 40 +++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 19881a505..7221a0830 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -228,6 +228,7 @@ extern "C" { GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); // Load all known backends from dynamic libraries GGML_API void ggml_backend_load_all(void); + GGML_API void ggml_backend_load_all_from_path(const char * dir_path); // // Backend scheduler diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 5cb0fb9d1..2e7340145 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -449,11 +449,21 @@ static std::string backend_filename_suffix() { #endif } -static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { +static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // TODO: search system paths - std::vector search_paths = { "./", get_executable_path() }; std::string file_prefix = backend_filename_prefix() + name + "-"; + std::vector search_paths; + if (user_search_path == nullptr) { + search_paths.push_back("./"); + search_paths.push_back(get_executable_path()); + } else { +#if defined(_WIN32) + search_paths.push_back(std::string(user_search_path) + "\\"); +#else + search_paths.push_back(std::string(user_search_path) + "/"); +#endif + } int best_score = 0; std::string best_path; @@ -509,21 +519,25 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) } void ggml_backend_load_all() { + ggml_backend_load_all_from_path(nullptr); +} + +void ggml_backend_load_all_from_path(const char * dir_path) { #ifdef NDEBUG bool silent = true; #else bool silent = false; #endif - ggml_backend_load_best("blas", silent); - ggml_backend_load_best("cann", silent); - ggml_backend_load_best("cuda", silent); - ggml_backend_load_best("hip", silent); - ggml_backend_load_best("kompute", silent); - ggml_backend_load_best("metal", silent); - ggml_backend_load_best("rpc", silent); - ggml_backend_load_best("sycl", silent); - ggml_backend_load_best("vulkan", silent); - ggml_backend_load_best("musa", silent); - ggml_backend_load_best("cpu", silent); + ggml_backend_load_best("blas", silent, dir_path); + ggml_backend_load_best("cann", silent, dir_path); + ggml_backend_load_best("cuda", silent, dir_path); + ggml_backend_load_best("hip", silent, dir_path); + ggml_backend_load_best("kompute", silent, dir_path); + ggml_backend_load_best("metal", silent, dir_path); + ggml_backend_load_best("rpc", silent, dir_path); + ggml_backend_load_best("sycl", silent, dir_path); + ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("musa", silent, dir_path); + ggml_backend_load_best("cpu", silent, dir_path); } From 4b4d92b0986e3b627b9a9ef4782973108bf47691 Mon Sep 17 00:00:00 2001 From: CentricStorm Date: Wed, 11 Dec 2024 10:47:43 +0000 Subject: [PATCH 003/380] docs: fix server documentation formatting (#10776) --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 6294f541f..2bd23d1a6 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -442,7 +442,7 @@ These words will not be included in the completion, so make sure to add them to `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` **Response format** From 484d2f31aed34ff9f096e3961125762e81d9b7d6 Mon Sep 17 00:00:00 2001 From: kallewoof Date: Wed, 11 Dec 2024 22:48:04 +0900 Subject: [PATCH 004/380] bug-fix: snprintf prints NULL in place of the last character (#10419) * bug-fix: snprintf prints NULL in place of the last character We need to give snprintf enough space to print the last character and the null character, thus we allocate one extra byte and then ignore it when converting to std::string. * add comment about extra null-term byte requirement --- examples/server/utils.hpp | 2 +- include/llama.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f545aea5..2fcb895ab 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -333,7 +333,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) { if (res < 2) { return ""; } else { - std::vector model_template(res, 0); + std::vector model_template(res + 1, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); return std::string(model_template.data(), model_template.size() - 1); } diff --git a/include/llama.h b/include/llama.h index 36945cde3..eebbacb80 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,6 +456,7 @@ extern "C" { // Functions to access the model's GGUF metadata scalar values // - The functions return the length of the string on success, or -1 on failure // - The output string is always null-terminated and cleared on failure + // - When retrieving a string, an extra byte must be allocated to account for the null terminator // - GGUF array values are not supported by these functions // Get metadata value as a string by key name From 92f77a640f763c0af73554fb810a85a7d4c85e5e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 14:59:41 +0100 Subject: [PATCH 005/380] ci : pin nodejs to 22.11.0 (#10779) --- .github/workflows/server.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68c..671fe595c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html From 1a31d0dc00ba946d448e16ecc915ce5e8355994e Mon Sep 17 00:00:00 2001 From: qingy1337 Date: Wed, 11 Dec 2024 07:16:32 -0800 Subject: [PATCH 006/380] Update README.md (#10772) --- examples/quantize/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..f9cce7b21 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) From 235f6e14bf0ed0211c51aeff14139038ae1000aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 20:52:14 +0100 Subject: [PATCH 007/380] server : (UI) add tok/s, get rid of completion.js (#10786) * get rid of completion.js * extract chat bubble to a component * add tok/s info * sync * fix BASE_URL * only extract timings when it's enabled * fix auto scroll --- examples/server/public/index.html | 198 ++++++++++++--------- examples/server/webui/index.html | 124 ++++++++----- examples/server/webui/package-lock.json | 7 + examples/server/webui/package.json | 1 + examples/server/webui/src/completion.js | 225 ------------------------ examples/server/webui/src/main.js | 128 +++++++++++--- 6 files changed, 307 insertions(+), 376 deletions(-) delete mode 100644 examples/server/webui/src/completion.js diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 250729a44..9a19c5e83 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -11,84 +11,84 @@ 🦙 llama.cpp - chat - - + @@ -99,7 +99,7 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
-
+

Conversations

@@ -204,51 +204,25 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au( {{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -311,6 +285,10 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
Advanced config
+
+ + Show tokens per second +
+ + + + +