From 44fbe34360dd760f9e68b4271f21533436397f84 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Fri, 9 Feb 2024 06:52:33 +0100 Subject: [PATCH 01/20] Fix Vulkan crash on APUs with very little device memory (#5424) * Fix Vulkan crash on APUs with very little device memory * Fix debug output function names --- ggml-vulkan.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 9e2846ee4..254f648a6 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -744,6 +744,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz } if (memory_type_index >= mem_props.memoryTypeCount) { + ctx->device.lock()->device.destroyBuffer(buf->buffer); + buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); } @@ -3875,7 +3877,7 @@ static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_ctx->preallocate_buffers_graph(" << node << ")" << std::endl; + std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif const bool any_on_device = node->backend == GGML_BACKEND_GPU || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) @@ -3994,8 +3996,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { return; } #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_ctx->preallocate_buffers()" << std::endl; - std::cerr << "qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << std::endl; + std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); From b2f87cb64db47d799b6f3656855c9caf9792ab2a Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Fri, 9 Feb 2024 10:56:43 +0100 Subject: [PATCH 02/20] ggml : fix `error C2078: too many initializers` for MSVC ARM64 (#5404) --- ggml-quants.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 101d3e783..1031e3761 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -268,6 +268,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) #if defined(__ARM_NEON) + +#ifdef _MSC_VER + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif + #if !defined(__aarch64__) // 64-bit compatibility @@ -8698,10 +8709,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); - const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], 
iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; - const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; - const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; - const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]); + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]); q3 += 16; q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); From e4124c24775f2cb5b3d7acc93bf9dc5471c172ef Mon Sep 17 00:00:00 2001 From: Marko Tasic Date: Fri, 9 Feb 2024 11:17:00 +0100 Subject: [PATCH 03/20] readme : add JavaScript/Wasm repo (#5415) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 66166c01b..0b4efdd33 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,7 @@ Typically finetunes of the base models below are supported as well. - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) +- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) From e5ca3937c685d6e012ac4db40555d6ec100ff03c Mon Sep 17 00:00:00 2001 From: Paul Tsochantaris Date: Fri, 9 Feb 2024 10:48:06 +0000 Subject: [PATCH 04/20] llama : do not cap thread count when MoE on CPU (#5419) * Not capping thread count when MoE inference is running on CPU * Whitespace --- llama.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index db7d1c1cd..0566b087b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7285,7 +7285,9 @@ static int llama_decode_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls. need a better solution - if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is + // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. 
+ if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } From 7c777fcd5dd4af7079e33390cf6a19c328a2666f Mon Sep 17 00:00:00 2001 From: Riley Stewart Date: Fri, 9 Feb 2024 02:49:49 -0800 Subject: [PATCH 05/20] server : fix prompt caching for repeated prompts (#5420) --- examples/server/server.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eceda30d0..8d668f798 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1592,10 +1592,6 @@ struct llama_server_context LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - - llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); - slot.cache_tokens = prompt_tokens; if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) @@ -1609,6 +1605,10 @@ struct llama_server_context } } + LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); + + llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + LOG_VERBOSE("prompt ingested", { {"n_past", slot.n_past}, {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, From e00d2a62dd1441e3b089570ec06d05c18800d368 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 9 Feb 2024 14:00:59 +0100 Subject: [PATCH 06/20] llava : add requirements.txt and update README.md (#5428) * llava: add requirements.txt and update README.md This commit adds a `requirements.txt` file to the `examples/llava` directory. This file contains the required Python packages to run the scripts in the `examples/llava` directory. The motivation of this to make it easier for users to run the scripts in `examples/llava`. This will avoid users from having to possibly run into missing package issues if the packages are not installed on their system. Signed-off-by: Daniel Bevenius * llava: fix typo in llava-surgery.py output Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 12 +++++++++--- examples/llava/llava-surgery.py | 2 +- examples/llava/requirements.txt | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 examples/llava/requirements.txt diff --git a/examples/llava/README.md b/examples/llava/README.md index 721d5e613..19f1a50a2 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b git clone https://huggingface.co/openai/clip-vit-large-patch14-336 ``` -2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: +2. Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: ```sh python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b ``` -3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: +4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: ```sh python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` -4. 
Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: +5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: ```sh python ./convert.py ../llava-v1.5-7b diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py index 515f6b58d..0a61efdfe 100644 --- a/examples/llava/llava-surgery.py +++ b/examples/llava/llava-surgery.py @@ -42,5 +42,5 @@ if len(clip_tensors) > 0: torch.save(checkpoint, path) print("Done!") -print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt new file mode 100644 index 000000000..f80f727a7 --- /dev/null +++ b/examples/llava/requirements.txt @@ -0,0 +1,3 @@ +-r ../../requirements/requirements-convert.txt +pillow~=10.2.0 +torch~=2.1.1 From 4b7b38bef5addbd31f453871d79647fbae6bec8a Mon Sep 17 00:00:00 2001 From: Neuman Vong Date: Sat, 10 Feb 2024 05:30:19 +1100 Subject: [PATCH 07/20] vulkan: Set limit for task concurrency (#5427) A common default for the maximum number of open files is 256, which can lead to `asyncio.gather(*tasks)` failing with Too many open files. $ python ggml_vk_generate_shaders.py --glslc=$ANDROID_NDK_PATH/shader-tools/darwin-x86_64/glslc ggml_vulkan: Generating and compiling shaders to SPIR-V Traceback (most recent call last): File "/Users/neuman/Code.noindex/github/llama.cpp/ggml_vk_generate_shaders.py", line 2326, in asyncio.run(main()) File "/Users/neuman/Code.noindex/miniforge3/lib/python3.10/asyncio/runners.py", line 44, in run return loop.run_until_complete(main) File "/Users/neuman/Code.noindex/miniforge3/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete return future.result() File "/Users/neuman/Code.noindex/github/llama.cpp/ggml_vk_generate_shaders.py", line 2294, in main await asyncio.gather(*tasks) [...snip...] OSError: [Errno 24] Too many open files This change sets a reasonable concurrency limit for tasks (and therefore open files), without significant impact on run time. --- ggml_vk_generate_shaders.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 4abb0383f..b2e86e182 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -2067,6 +2067,8 @@ type_names = { K_QUANTS_PER_ITERATION = 2 +ASYNCIO_CONCURRENCY = 64 + output_dir = gettempdir() lock = asyncio.Lock() @@ -2291,7 +2293,14 @@ async def main(): tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - await asyncio.gather(*tasks) + # Helper to decorate tasks with semaphore acquisition. + async def withSemaphore(sem, task): + async with sem: + return await task + + # Run tasks concurrently guarded by a concurrency limit. 
+ sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY) + await asyncio.gather(*(withSemaphore(sem, task) for task in tasks)) with open("ggml-vulkan-shaders.hpp", "w") as f: f.write("#include \n\n") From 4633d93af08d890ecd00fa6e4f61d76f21cded4c Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Fri, 9 Feb 2024 10:42:27 +0100 Subject: [PATCH 08/20] ggml : add abort_callback for cpu backend (ggml/725) * a way to use abort_callback with the cpu backend * whisper update --- ggml-backend.c | 26 ++++++++++++++++++++++---- ggml-backend.h | 5 +++-- ggml.c | 2 +- ggml.h | 9 +++++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 0764dfebc..532da8eda 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -653,6 +653,9 @@ struct ggml_backend_cpu_context { int n_threads; void * work_data; size_t work_size; + + ggml_abort_callback abort_callback; + void * abort_callback_data; }; GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) { @@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); } + cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; + cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data; + return cpu_plan; } @@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); cpu_ctx->work_size = cplan.work_size; } - cplan.work_data = cpu_ctx->work_data; + cplan.abort_callback = cpu_ctx->abort_callback; + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + ggml_graph_compute(cgraph, &cplan); return true; } @@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = { ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); - ctx->n_threads = GGML_DEFAULT_N_THREADS; - ctx->work_data = NULL; - ctx->work_size = 0; + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + ctx->abort_callback = NULL; + ctx->abort_callback_data = NULL; ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); @@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; +} + GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } diff --git a/ggml-backend.h b/ggml-backend.h index 8b8160fcf..282b3a9b7 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -83,8 +83,9 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void 
ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); diff --git a/ggml.c b/ggml.c index f783a6fd3..86cd65862 100644 --- a/ggml.c +++ b/ggml.c @@ -16649,7 +16649,7 @@ struct ggml_compute_state_shared { atomic_int node_n; // active graph node atomic_int node_task; // active graph node task phase - bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; }; diff --git a/ggml.h b/ggml.h index e0a4799f3..1360cd8ee 100644 --- a/ggml.h +++ b/ggml.h @@ -567,6 +567,11 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // Abort callback + // If not NULL, called before ggml computation + // If it returns true, the computation is aborted + typedef bool (*ggml_abort_callback)(void * data); + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -576,8 +581,8 @@ extern "C" { int n_threads; // abort ggml_graph_compute when true - bool (*abort_callback)(void * data); - void * abort_callback_data; + ggml_abort_callback abort_callback; + void * abort_callback_data; }; enum ggml_cgraph_eval_order { From 43b65f5eb85e8741aba573a8f65bb8efad245d31 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 10 Feb 2024 09:30:36 +0200 Subject: [PATCH 09/20] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7b6c17915..6ae75bc31 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -475cbad5c1c834e31e26a2283bc1413181644360 +2c7cf49810d523b9632da393a9e8270b60bf3b24 From cd9aea63b577a83def84dbd6dcd90a6fa02af745 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 10 Feb 2024 09:53:05 +0200 Subject: [PATCH 10/20] scripts : update sync scripts with new backends --- scripts/sync-ggml-am.sh | 12 ++++++++++++ scripts/sync-ggml.sh | 6 ++++++ 2 files changed, 18 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 6b2514a11..2c391e641 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-cuda.cu -> ggml-cuda.cu # src/ggml-cuda.h -> ggml-cuda.h # src/ggml-impl.h -> ggml-impl.h + # src/ggml-kompute.cpp -> ggml-kompute.cpp + # src/ggml-kompute.h -> ggml-kompute.h # src/ggml-metal.h -> ggml-metal.h # src/ggml-metal.m -> ggml-metal.m # src/ggml-mpi.h -> ggml-mpi.h @@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-opencl.h -> ggml-opencl.h # src/ggml-quants.c -> ggml-quants.c # src/ggml-quants.h -> ggml-quants.h + # src/ggml-sycl.cpp -> ggml-sycl.cpp + # src/ggml-sycl.h -> ggml-sycl.h + # src/ggml-vulkan.cpp -> ggml-vulkan.cpp + # src/ggml-vulkan.h -> ggml-vulkan.h # include/ggml/ggml.h -> ggml.h # include/ggml/ggml-alloc.h -> ggml-alloc.h # include/ggml/ggml-backend.h -> ggml-backend.h @@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ + -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \ + -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \ -e 
's/src\/ggml-metal\.h/ggml-metal.h/g' \ -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \ @@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \ -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \ -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \ + -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \ + -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \ + -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \ + -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \ -e 's/include\/ggml\/ggml\.h/ggml.h/g' \ -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \ -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 0097db435..feb34bbc8 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -7,6 +7,8 @@ cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h +cp -rpv ../ggml/src/ggml-kompute.cpp ./ggml-kompute.cpp +cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal @@ -16,6 +18,10 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h +cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp +cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h +cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp +cp -rpv ../ggml/src/ggml-vulkan.h ./ggml-vulkan.h cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h From f026f8120f97090d34a52b3dc023c82e0ede3f7d Mon Sep 17 00:00:00 2001 From: Ian Bull Date: Sat, 10 Feb 2024 02:53:28 -0800 Subject: [PATCH 11/20] metal : use autoreleasepool to avoid memory leaks (#5437) There appears to be a known memory leak when using the `MLTCommandBuffer`. It is suggested to use `@autoreleasepool` in [1,2] [1] https://developer.apple.com/forums/thread/662721 [2] https://forums.developer.apple.com/forums/thread/120931 This change-set wraps the `ggml_metal_graph_compute` in a `@autoreleasepool`. 
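For context, the fix relies on the standard Objective-C autorelease-pool idiom: autoreleased objects created inside the block (such as transient MTLCommandBuffer instances handed out by the command queue) are released when the pool drains at the closing brace. A minimal sketch of that idiom follows; the `queue` variable and the encoding steps are placeholders rather than the actual llama.cpp code, which simply wraps the whole graph-compute body:

    @autoreleasepool {
        // -commandBuffer returns an autoreleased MTLCommandBuffer
        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
        // ... encode compute work on cmd_buf ...
        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    } // pool drains here, releasing per-iteration Metal objects
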
This commit addresses https://github.com/ggerganov/llama.cpp/issues/5436 --- ggml-metal.m | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 5260ed827..c1d8e2de8 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { + @autoreleasepool { MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; edesc.dispatchType = MTLDispatchTypeSerial; @@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute( [[MTLCaptureManager sharedCaptureManager] stopCapture]; } + } return true; } From 907e08c1109f498b01036367804cff3082c44524 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 11 Feb 2024 11:16:22 +0100 Subject: [PATCH 12/20] server : add llama2 chat template (#5425) * server: add mistral chat template * server: fix typo * server: rename template mistral to llama2 * server: format_llama2: remove BOS * server: validate "--chat-template" argument * server: clean up using_chatml variable Co-authored-by: Jared Van Bortel --------- Co-authored-by: Jared Van Bortel --- examples/server/oai.hpp | 8 ++++++-- examples/server/server.cpp | 22 ++++++++++++++++++++-- examples/server/utils.hpp | 30 ++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp index 43410f803..2eca8a9fb 100644 --- a/examples/server/oai.hpp +++ b/examples/server/oai.hpp @@ -15,9 +15,13 @@ using json = nlohmann::json; inline static json oaicompat_completion_params_parse( - const json &body /* openai api json semantics */) + const json &body, /* openai api json semantics */ + const std::string &chat_template) { json llama_params; + std::string formatted_prompt = chat_template == "chatml" + ? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...) + : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...) llama_params["__oaicompat"] = true; @@ -30,7 +34,7 @@ inline static json oaicompat_completion_params_parse( // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' + llama_params["prompt"] = formatted_prompt; llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8d668f798..4d212f1f0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -36,6 +36,7 @@ struct server_params std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; + std::string chat_template = "chatml"; int32_t port = 8080; int32_t read_timeout = 600; int32_t write_timeout = 600; @@ -1859,6 +1860,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); + printf(" --chat-template FORMAT_NAME"); + printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str()); printf("\n"); } @@ -2290,6 +2293,21 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, log_set_target(stdout); LOG_INFO("logging to file is disabled.", {}); } + else if (arg == "--chat-template") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + if (value != "chatml" && value != "llama2") { + fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str()); + invalid_param = true; + break; + } + sparams.chat_template = value; + } else if (arg == "--override-kv") { if (++i >= argc) { @@ -2743,13 +2761,13 @@ int main(int argc, char **argv) // TODO: add mount point without "/v1" prefix -- how? - svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) + svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } - json data = oaicompat_completion_params_parse(json::parse(req.body)); + json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template); const int task_id = llama.queue_tasks.get_new_id(); llama.queue_results.add_waiting_task_id(task_id); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 70cce0721..548548962 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -167,6 +167,34 @@ static T json_value(const json &body, const std::string &key, const T &default_v : default_value; } +inline std::string format_llama2(std::vector messages) +{ + std::ostringstream output; + bool is_inside_turn = false; + + for (auto it = messages.begin(); it != messages.end(); ++it) { + if (!is_inside_turn) { + output << "[INST] "; + } + std::string role = json_value(*it, "role", std::string("user")); + std::string content = json_value(*it, "content", std::string("")); + if (role == "system") { + output << "<>\n" << content << "\n<>\n\n"; + is_inside_turn = true; + } else if (role == "user") { + output << content << " [/INST]"; + is_inside_turn = true; + } else { + output << " " << content << " "; + is_inside_turn = false; + } + } + + LOG_VERBOSE("format_llama2", {{"text", output.str()}}); + + return output.str(); +} + inline std::string format_chatml(std::vector messages) { std::ostringstream chatml_msgs; @@ -180,6 +208,8 @@ inline std::string format_chatml(std::vector messages) chatml_msgs << "<|im_start|>assistant" << '\n'; + LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}}); + return chatml_msgs.str(); } From e4640d8fdf56f14a6db3d092bcd3d2d315cb5d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 11 Feb 2024 12:44:51 +0100 Subject: [PATCH 13/20] lookup: add print for drafting performance (#5450) --- examples/lookup/lookup.cpp | 12 +++++++++++- 1 file changed, 
11 insertions(+), 1 deletion(-) diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index d8de7dd38..18235b8a1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,7 +1,9 @@ #include "common.h" +#include "ggml.h" #include "llama.h" #include +#include #include #include #include @@ -73,6 +75,8 @@ int main(int argc, char ** argv){ int n_drafted = 0; int n_accept = 0; + int64_t t_draft_us = 0; + int n_past = inp.size(); bool has_eos = false; @@ -160,7 +164,7 @@ int main(int argc, char ** argv){ // generate n_pred tokens through prompt lookup auto prompt_lookup = [&]() -> void { - int inp_size = inp.size(); + const int inp_size = inp.size(); for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){ const llama_token * ngram = &inp[inp_size - ngram_size]; @@ -191,8 +195,12 @@ int main(int argc, char ** argv){ return; }; + const int64_t t_start_draft_us = ggml_time_us(); + prompt_lookup(); + t_draft_us += ggml_time_us() - t_start_draft_us; + llama_decode(ctx, batch_tgt); ++n_past; @@ -210,6 +218,8 @@ int main(int argc, char ** argv){ LOG_TEE("n_draft = %d\n", n_draft); LOG_TEE("n_predict = %d\n", n_predict); LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); LOG_TEE("n_accept = %d\n", n_accept); LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); From a07d0fee1f05c5c1dc49948ae1a3293db017275f Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Sun, 11 Feb 2024 07:22:33 -0600 Subject: [PATCH 14/20] ggml : add mmla kernels for quantized GEMM (#4966) * ggml: aarch64: implement smmla kernel for q8_0_q8_0 quantized gemm armv8.2-a and above supports MMLA instructions that have higher throughput than DOT. this commit adds mmla kernel for q8_0_q8_0 gemm. The feature is enabled if the platform supports "__ARM_FEATURE_MATMUL_INT8" On AWS Graviton3 processors this kernel resulted up to 1.5x improvement for prompt evaluation throughput compared to the default sdot kernel. * ggml: aarch64: implement smmla kernel for q4_0_q8_0 quantized gemm armv8.2-a and above supports MMLA instructions that have higher throughput than DOT. this commit adds mmla kernel for q4_0_q8_0 gemm. The feature is enabled if the platform supports "__ARM_FEATURE_MATMUL_INT8" On AWS Graviton3 processors this kernel resulted up to 1.5x improvement for prompt evaluation throughput compared to the default sdot kernel. * ggml: aarch64: implement smmla kernel for q4_1_q8_1 quantized gemm armv8.2-a and above supports MMLA instructions that have higher throughput than DOT. this commit adds mmla kernel for q4_1_q8_1 gemm. The feature is enabled if the platform supports "__ARM_FEATURE_MATMUL_INT8" On AWS Graviton3 processors this kernel resulted up to 1.5x improvement for prompt evaluation throughput compared to the default sdot kernel. 
* ggml: update unit tests for the new vec_dot interface * llama.cpp: add MATMUL_INT8 capability to system_info --- common/common.cpp | 1 + ggml-quants.c | 320 +++++++++++++++++++++++++++++++++-- ggml-quants.h | 26 +-- ggml.c | 164 ++++++++++++------ ggml.h | 5 +- llama.cpp | 1 + pocs/vdot/q8dot.cpp | 4 +- pocs/vdot/vdot.cpp | 4 +- tests/test-quantize-fns.cpp | 2 +- tests/test-quantize-perf.cpp | 2 +- 10 files changed, 441 insertions(+), 88 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index e0082a823..9a489a553 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1550,6 +1550,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); diff --git a/ggml-quants.c b/ggml-quants.c index 1031e3761..6c122dd2a 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -49,6 +49,8 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define UNUSED GGML_UNUSED + #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) @@ -3677,15 +3679,88 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_0 * restrict vx0 = vx; + const block_q4_0 * restrict vx1 = vx + bx; + + const block_q8_0 * restrict vy0 = vy; + const block_q8_0 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_0 * restrict b_x0 = &vx0[i]; + const block_q4_0 * restrict b_x1 = &vx1[i]; + const block_q8_0 * restrict b_y0 = &vy0[i]; + const block_q8_0 * restrict b_y1 = &vy1[i]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); + const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); + const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); + const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + 
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -3967,15 +4042,89 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q4_1 * restrict vx0 = vx; + const block_q4_1 * restrict vx1 = vx + bx; + const block_q8_1 * restrict vy0 = vy; + const block_q8_1 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t summs0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q4_1 * restrict b_x0 = &vx0[i]; + const block_q4_1 * restrict b_x1 = &vx1[i]; + const block_q8_1 * restrict b_y0 = &vy0[i]; + const block_q8_1 * restrict b_y1 = &vy1[i]; + + float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s, + GGML_FP16_TO_FP32(b_x1->m) * b_y0->s, + GGML_FP16_TO_FP32(b_x0->m) * b_y1->s, + GGML_FP16_TO_FP32(b_x1->m) * b_y1->s}; + summs0 += summs_t; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); + const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); + + // 4-bit -> 8-bit + const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + // mmla into int32x4_t + float32x4_t scale = 
{GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + sumv2 = sumv2 + summs0; + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif // TODO: add WASM SIMD #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -4107,12 +4256,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -4393,12 +4547,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -4692,15 +4851,75 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); +#if defined(__ARM_FEATURE_MATMUL_INT8) + assert((nrc == 2) || (nrc == 1)); +#else + assert(nrc == 1); +#endif const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; +#if defined(__ARM_FEATURE_MATMUL_INT8) + if (nrc == 2) { + const block_q8_0 * restrict vx0 = vx; + const block_q8_0 * 
restrict vx1 = vx + bx; + const block_q8_0 * restrict vy0 = vy; + const block_q8_0 * restrict vy1 = vy + by; + + float32x4_t sumv0 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const block_q8_0 * restrict b_x0 = &vx0[i]; + const block_q8_0 * restrict b_y0 = &vy0[i]; + + const block_q8_0 * restrict b_x1 = &vx1[i]; + const block_q8_0 * restrict b_y1 = &vy1[i]; + + const int8x16_t x0_l = vld1q_s8(b_x0->qs); + const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); + const int8x16_t x1_l = vld1q_s8(b_x1->qs); + const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); + + // load y + const int8x16_t y0_l = vld1q_s8(b_y0->qs); + const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); + const int8x16_t y1_l = vld1q_s8(b_y1->qs); + const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); + + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + + int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); + + int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); + + int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); + + int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); + + sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), + l1, r1)), l2, r2)), l3, r3))), scale); + } + float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); + float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); + + vst1_f32(s, vget_low_f32(sumv2)); + vst1_f32(s + bs, vget_high_f32(sumv2)); + return; + } +#endif #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -4795,7 +5014,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q2_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -5171,7 +5395,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q2_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -5429,8 +5658,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void 
ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; @@ -5949,8 +6183,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q3_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6292,8 +6531,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6648,8 +6892,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -6891,8 +7140,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7311,8 +7565,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q5_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -7577,8 +7836,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict 
vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q6_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8009,8 +8273,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q6_K * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8339,8 +8608,13 @@ static const int8_t keven_signs_q2xs[1024] = { 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; -void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq2_xxs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8462,8 +8736,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res #endif } -void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq2_xs * restrict x = vx; const block_q8_K * restrict y = vy; @@ -8682,8 +8961,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest } // TODO -void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_iq3_xxs * restrict x = vx; const block_q8_K * restrict y = vy; diff --git a/ggml-quants.h b/ggml-quants.h index bfdf3c997..68f09b1e1 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * 
GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") diff --git a/ggml.c b/ggml.c index 86cd65862..e45b78d7e 100644 --- a/ggml.c +++ b/ggml.c @@ -428,8 +428,8 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -457,6 +457,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .is_quantized = false, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, + .nrows = 1, }, [GGML_TYPE_F16] = { .type_name = "f16", @@ -468,6 +469,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, + .nrows = 1, }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", @@ -479,6 +481,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, .vec_dot = ggml_vec_dot_q4_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [GGML_TYPE_Q4_1] = { .type_name = "q4_1", @@ -490,6 +497,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, .vec_dot = ggml_vec_dot_q4_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [4] = { // GGML_TYPE_Q4_2 .type_name = "DEPRECATED", @@ -501,6 +513,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, + .nrows = 1, }, [5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", @@ -512,6 +525,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = NULL, .vec_dot_type = GGML_TYPE_COUNT, + .nrows = 1, }, [GGML_TYPE_Q5_0] = { .type_name = "q5_0", @@ -523,6 +537,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, .vec_dot = ggml_vec_dot_q5_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, }, [GGML_TYPE_Q5_1] = { .type_name = "q5_1", @@ -534,6 +549,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, .vec_dot = ggml_vec_dot_q5_1_q8_1, .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, }, [GGML_TYPE_Q8_0] = { .type_name = "q8_0", @@ -545,6 +561,11 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif }, [GGML_TYPE_Q8_1] = { .type_name = "q8_1", @@ -554,6 +575,7 @@ static const ggml_type_traits_t 
type_traits[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, }, [GGML_TYPE_Q2_K] = { .type_name = "q2_K", @@ -565,6 +587,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, .vec_dot = ggml_vec_dot_q2_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q3_K] = { .type_name = "q3_K", @@ -576,6 +599,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, .vec_dot = ggml_vec_dot_q3_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", @@ -587,6 +611,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, .vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q5_K] = { .type_name = "q5_K", @@ -598,6 +623,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, .vec_dot = ggml_vec_dot_q5_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q6_K] = { .type_name = "q6_K", @@ -609,6 +635,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, .vec_dot = ggml_vec_dot_q6_K_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_IQ2_XXS] = { .type_name = "iq2_xxs", @@ -620,6 +647,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_IQ2_XS] = { .type_name = "iq2_xs", @@ -631,6 +659,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = NULL, .vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_IQ3_XXS] = { .type_name = "iq3_xxs", @@ -642,6 +671,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference, .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", @@ -1212,7 +1242,13 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1249,7 +1285,13 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(int n, float * restrict s, 
size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1455,7 +1497,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #endif } -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } @@ -9992,6 +10034,7 @@ static void ggml_compute_forward_mul_mat( ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + int64_t const vec_dot_num_rows = type_traits[type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -10159,12 +10202,23 @@ static void ggml_compute_forward_mul_mat( const int64_t blck_0 = 16; const int64_t blck_1 = 16; + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t nrc = vec_dot_num_rows; + // TODO: currently the mmla kernels support only even numbered rows/cols. + // this check can be removed once they are extended to support odd numbered rows/cols too + if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) { + nrc = 1; + } + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + // attempt to reduce false-sharing (does not seem to make a difference) - float tmp[16]; + // 16 * 2, accounting for mmla kernels + float tmp[32]; for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) { const int64_t i13 = (ir1/(ne12*ne1)); const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); @@ -10187,17 +10241,19 @@ static void ggml_compute_forward_mul_mat( (src1_cont || src1->type != vec_dot_type ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size : (i11*nb11 + i12*nb12 + i13*nb13)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); //} - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) { + vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? 
src1_col_stride : 0), nrc); + } + + for (int cn = 0; cn < nrc; ++cn) { + memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } - memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } } } @@ -10386,7 +10442,7 @@ static void ggml_compute_forward_mul_mat_id( //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1); } memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } @@ -11568,7 +11624,7 @@ static void ggml_compute_forward_soft_max_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); ggml_vec_cpy_f32 (nc, dx, dy); ggml_vec_acc1_f32(nc, dx, -dot_y_dy); ggml_vec_mul_f32 (nc, dx, dx, y); @@ -12369,9 +12425,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( const int i1n = i10*ne11; for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f16(ne02, &v, - (ggml_fp16_t *) wdata_src + i1n, - (ggml_fp16_t *) wdata_kernel + i00*ne02); + ggml_vec_dot_f16(ne02, &v, 0, + (ggml_fp16_t *) wdata_src + i1n, 0, + (ggml_fp16_t *) wdata_kernel + i00*ne02, 0, 1); dst_data[i10*s0 + i00] += v; } } @@ -12466,9 +12522,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const int i1n = i10*ne11; for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f32(ne02, &v, - wdata_src + i1n, - wdata_kernel + i00*ne02); + ggml_vec_dot_f32(ne02, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i00*ne02, 0, 1); dst_data[i10*s0 + i00] += v; } } @@ -12783,9 +12839,9 @@ static void ggml_compute_forward_conv_transpose_2d( for (int i01 = 0; i01 < ne01; i01++) { for (int i00 = 0; i00 < ne00; i00++) { float v = 0; - ggml_vec_dot_f16(ne03, &v, - wdata_src + i1n, - wdata_kernel + i01*ne00*ne03 + i00*ne03); + ggml_vec_dot_f16(ne03, &v, 0, + wdata_src + i1n, 0, + wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1); dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; } } @@ -13214,9 +13270,9 @@ static void ggml_compute_forward_flash_attn_f32( const int i1 = ik1; ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -13299,9 +13355,9 @@ static void ggml_compute_forward_flash_attn_f32( const int iv3 = iq3; ggml_vec_dot_f32(masked_begin, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, + S, 0, 1); } } } @@ -13404,9 +13460,9 @@ static void ggml_compute_forward_flash_attn_f16( const int i1 = ik1; ggml_vec_dot_f16(neq0, - S + i1, - (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } } else { for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { @@ 
-13508,9 +13564,9 @@ static void ggml_compute_forward_flash_attn_f16( const int iv3 = iq3; ggml_vec_dot_f16(nev0, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), - S16); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0, + S16, 0, 1); } } else { for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { @@ -13652,9 +13708,9 @@ static void ggml_compute_forward_flash_ff_f16( const int i1 = ib01; ggml_vec_dot_f16(nea0, - S + i1, - (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), - (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + S + i1, 0, + (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0, + (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1); } ggml_vec_add_f32(neb01, S, S, (float *) b1->data); @@ -13677,9 +13733,9 @@ static void ggml_compute_forward_flash_ff_f16( for (int64_t ic = 0; ic < nec01; ++ic) { ggml_vec_dot_f16(neb01, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), - S16); + (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0, + (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0, + S16, 0, 1); } ggml_vec_add_f32(nec01, @@ -13866,9 +13922,9 @@ static void ggml_compute_forward_flash_attn_back_f32( const int i1 = ik1; ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + S + i1, 0, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1); } // scale @@ -14013,7 +14069,7 @@ static void ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, SM, S); + ggml_vec_dot_f32 (masked_begin, &dot_SM_gradSM, 0, SM, 0, S, 0, 1); ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); ggml_vec_mul_f32 (masked_begin, S, S, SM); @@ -18382,7 +18438,7 @@ static enum ggml_opt_result linesearch_backtracking( } // compute the initial gradient in the search direction - ggml_vec_dot_f32(nx, &dginit, g, d); + ggml_vec_dot_f32(nx, &dginit, 0, g, 0, d, 0, 1); // make sure that d points to a descent direction if (0 < dginit) { @@ -18432,7 +18488,7 @@ static enum ggml_opt_result linesearch_backtracking( return count; } - ggml_vec_dot_f32(nx, &dg, g, d); + ggml_vec_dot_f32(nx, &dg, 0, g, 0, d, 0, 1); // check the Wolfe condition if (dg < params->lbfgs.wolfe * dginit) { @@ -18693,8 +18749,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. 
// - ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); - ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + ggml_vec_dot_f32(nx, &ys, 0, &lm_y[end[0]*nx], 0, &lm_s[end[0]*nx], 0, 1); + ggml_vec_dot_f32(nx, &yy, 0, &lm_y[end[0]*nx], 0, &lm_y[end[0]*nx], 0, 1); lm_ys[end[0]] = ys; @@ -18713,7 +18769,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( for (int i = 0; i < bound; ++i) { j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], 0, &lm_s[j[0]*nx], 0, d, 0, 1); lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); @@ -18723,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + ggml_vec_dot_f32(nx, &beta, 0, &lm_y[j[0]*nx], 0, d, 0, 1); beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); @@ -20611,4 +20667,12 @@ int ggml_cpu_has_vsx(void) { #endif } +int ggml_cpu_has_matmul_int8(void) { +#if defined(__ARM_FEATURE_MATMUL_INT8) + return 1; +#else + return 0; +#endif +} + //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml.h b/ggml.h index 1360cd8ee..9cfec5bac 100644 --- a/ggml.h +++ b/ggml.h @@ -2278,6 +2278,7 @@ extern "C" { GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_vsx (void); + GGML_API int ggml_cpu_has_matmul_int8(void); // // Internal types and functions exposed for tests and benchmarks @@ -2291,7 +2292,8 @@ extern "C" { #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); - typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); typedef struct { const char * type_name; @@ -2303,6 +2305,7 @@ extern "C" { ggml_from_float_t from_float_reference; ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously; } ggml_type_traits_t; GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); diff --git a/llama.cpp b/llama.cpp index 0566b087b..3f39a67fb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11869,6 +11869,7 @@ const char * llama_print_system_info(void) { s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; return s.c_str(); } diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 111770d55..1a52ff5e9 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data()); - else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data()); + if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 
0, x40.data(), 0, y.data(), 0, 1); + else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 73ffcd1ca..17e9e4482 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -284,8 +284,8 @@ int main(int argc, char** argv) { else { auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); vdot.from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data()); - else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data()); + if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 43df8022d..5e92d5742 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -87,7 +87,7 @@ static float dot_product_error( vdot.from_float(test_data2, tmp_q2.data(), test_size); float result = INFINITY; - qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data()); + qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); const float dot_ref = dot_product(test_data1, test_data2, test_size); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 8ec817344..48d9fae3d 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -346,7 +346,7 @@ int main(int argc, char * argv[]) { printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); auto quantize_fn = [&](void) -> float { float result; - qfns.vec_dot(size, &result, test_q1, test_q2); + qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); return result; }; size_t quantized_size = ggml_row_size(type, size); From 0f2411f154db46780d3aaa3a0664691b2170c83f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 11 Feb 2024 15:33:01 +0200 Subject: [PATCH 15/20] ggml : fix compile warnings (unused vars) (#4966) --- ggml-quants.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ggml-quants.c b/ggml-quants.c index 6c122dd2a..b2a309bf8 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3689,6 +3689,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r #else assert(nrc == 1); #endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -4052,6 +4056,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r #else assert(nrc == 1); #endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -4861,6 +4869,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r #else assert(nrc == 1); #endif + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; From 139b62a839825ef20084ed75ed624db7a5ad554a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 11 Feb 2024 15:33:43 +0200 Subject: [PATCH 16/20] common : fix compile warning --- common/sampling.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 844ad7c53..82cbdecea 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -127,8 
+127,6 @@ static void sampler_queue( const llama_sampling_params & params, llama_token_data_array & cur_p, size_t & min_keep) { - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; const float dynatemp_range = params.dynatemp_range; const float dynatemp_exponent = params.dynatemp_exponent; From 85910c5b30f6e268321be8df044f5528a6efac52 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 11 Feb 2024 15:35:50 +0200 Subject: [PATCH 17/20] main : ctrl+C print timing in non-interactive mode (#3873) --- examples/main/main.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0ed4d79f9..e8ab8cbae 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -98,7 +98,7 @@ static void write_logfile( #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void sigint_handler(int signo) { if (signo == SIGINT) { - if (!is_interacting) { + if (!is_interacting && g_params->interactive) { is_interacting = true; } else { console::cleanup(); @@ -392,7 +392,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); } - if (params.interactive) { + // ctrl+C handling + { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = sigint_handler; @@ -405,7 +406,9 @@ int main(int argc, char ** argv) { }; SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + } + if (params.interactive) { LOG_TEE("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { From 684780141a08200ec98eba3e982dbafd1d0b5000 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Sun, 11 Feb 2024 13:38:14 +0000 Subject: [PATCH 18/20] server : allow to specify tokens as strings in logit_bias (#5003) * server: allow to specify tokens as strings in logit_bias * Apply suggestions from code review Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- examples/server/README.md | 2 +- examples/server/server.cpp | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 1db7cdf21..0f7373ae8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -185,7 +185,7 @@ node index.js `ignore_eos`: Ignore end of stream token and continue generating (default: false). - `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []). + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []). 
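For example, a single request could mix both forms: `"logit_bias": [[15043, 1.0], ["Hello, World!", -0.5], ["###", false]]` — the token id and the strings here are illustrative only; numeric ids bias one token, string entries are tokenized and the bias is applied to each resulting token, and `false` bans the affected token(s) outright.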
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4d212f1f0..1699eb76b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -626,18 +626,36 @@ struct llama_server_context const int n_vocab = llama_n_vocab(model); for (const auto &el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) + if (el.is_array() && el.size() == 2) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) + float bias; + if (el[1].is_number()) { - if (el[1].is_number()) + bias = el[1].get(); + } + else if (el[1].is_boolean() && !el[1].get()) + { + bias = -INFINITY; + } + else + { + continue; + } + + if (el[0].is_number_integer()) + { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { - slot->sparams.logit_bias[tok] = el[1].get(); + slot->sparams.logit_bias[tok] = bias; } - else if (el[1].is_boolean() && !el[1].get()) + } + else if (el[0].is_string()) + { + auto toks = llama_tokenize(model, el[0].get(), false); + for (auto tok : toks) { - slot->sparams.logit_bias[tok] = -INFINITY; + slot->sparams.logit_bias[tok] = bias; } } } From a803333a4e6fc534c93afe90d741bc2388bdec87 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Sun, 11 Feb 2024 13:43:31 +0000 Subject: [PATCH 19/20] common : use enums for sampler types (#5418) * common: use enums for sampler types * Apply suggestions from code review Co-authored-by: Georgi Gerganov * minor : spaces --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 117 +++++++++++++++++++++++++++++++------------- common/common.h | 7 ++- common/sampling.cpp | 31 +++++------- common/sampling.h | 20 +++++++- 4 files changed, 120 insertions(+), 55 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9a489a553..f64da2cb6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - sparams.samplers_sequence = parse_samplers_input(argv[i]); + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names); } else if (arg == "--sampling-seq") { if (++i >= argc) { invalid_param = true; break; } - sparams.samplers_sequence = argv[i]; + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -906,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + } + sampler_type_names.pop_back(); + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); @@ -947,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: 
%d)\n", params.n_batch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str()); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -1097,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) { } // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input) { - std::string output = ""; +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::vector sampler_types_from_names(const std::vector & names) { // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map samplers_symbols { - {"top_k", 'k'}, - {"top-k", 'k'}, - {"top_p", 'p'}, - {"top-p", 'p'}, - {"nucleus", 'p'}, - {"typical_p", 'y'}, - {"typical-p", 'y'}, - {"typical", 'y'}, - {"min_p", 'm'}, - {"min-p", 'm'}, - {"tfs_z", 'f'}, - {"tfs-z", 'f'}, - {"tfs", 'f'}, - {"temp", 't'}, - {"temperature",'t'} + std::unordered_map sampler_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top-k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMP}, + {"temperature", llama_sampler_type::TEMP} }; - // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" - size_t separator = input.find(';'); - while (separator != input.npos) { - std::string name = input.substr(0,separator); - input = input.substr(separator+1); - separator = input.find(';'); - if (samplers_symbols.find(name) != samplers_symbols.end()) { - output += samplers_symbols[name]; + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto& name : names) { + const auto sampler_item = sampler_name_map.find(name); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); } } - if (samplers_symbols.find(input) != samplers_symbols.end()) { - output += samplers_symbols[input]; + return sampler_types; +} + +std::vector sampler_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + 
{'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMP} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + +std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMP: return "temp"; + default : return ""; } - return output; } // diff --git a/common/common.h b/common/common.h index 62de25d6a..9bdd45cf9 100644 --- a/common/common.h +++ b/common/common.h @@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input); +std::vector sampler_types_from_names(const std::vector & names); +std::vector sampler_types_from_chars(const std::string & names_string); +std::vector string_split(std::string input, char separator); +std::string sampler_type_to_name_string(llama_sampler_type sampler_type); // // Model utils diff --git a/common/sampling.cpp b/common/sampling.cpp index 82cbdecea..a001750da 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { - for (auto s : params.samplers_sequence) { - switch (s) { - case 'k': result += "-> top_k "; break; - case 'f': result += "-> tfs_z "; break; - case 'y': result += "-> typical_p "; break; - case 'p': result += "-> top_p "; break; - case 'm': result += "-> min_p "; break; - case 't': result += "-> temp "; break; - default : break; + for (auto sampler_type : params.samplers_sequence) { + const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + if (!sampler_type_name.empty()) { + result += "-> " + sampler_type_name + " "; } } } else { @@ -135,16 +130,16 @@ static void sampler_queue( const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const std::string & samplers_sequence = params.samplers_sequence; + const std::vector & samplers_sequence = params.samplers_sequence; - for (auto s : samplers_sequence) { - switch (s){ - case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; - case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; - case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; - case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; - case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': + for (auto sampler_type : samplers_sequence) { + switch (sampler_type) { + case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case llama_sampler_type::TYPICAL_P: 
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::TEMP: if (dynatemp_range > 0) { float dynatemp_min = std::max(0.0f, temp - dynatemp_range); float dynatemp_max = std::max(0.0f, temp + dynatemp_range); diff --git a/common/sampling.h b/common/sampling.h index 88899c094..2bd6a75d2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -8,6 +8,16 @@ #include #include +// sampler types +enum class llama_sampler_type : char { + TOP_K = 'k', + TOP_P = 'p', + MIN_P = 'm', + TFS_Z = 'f', + TYPICAL_P = 'y', + TEMP = 't' +}; + // sampling parameters typedef struct llama_sampling_params { int32_t n_prev = 64; // number of previous tokens to remember @@ -28,7 +38,15 @@ typedef struct llama_sampling_params { float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate bool penalize_nl = true; // consider newlines as a repeatable token - std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp + + std::vector samplers_sequence = { + llama_sampler_type::TOP_K, + llama_sampler_type::TFS_Z, + llama_sampler_type::TYPICAL_P, + llama_sampler_type::TOP_P, + llama_sampler_type::MIN_P, + llama_sampler_type::TEMP + }; std::string grammar; // optional BNF-like grammar to constrain sampling From c88c74f967028ae3d5ebade40ae586d20a961abc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20L=C3=B3pez?= Date: Sun, 11 Feb 2024 15:12:00 +0100 Subject: [PATCH 20/20] vulkan: only use M-sized matmul on Apple GPUs (#5412) * vulkan: refactor guess_matmul_pipeline for vendor Refactor ggml_vk_guess_matmul_pipeline to simplify adding per-vendor conditionals. Signed-off-by: Sergio Lopez * vulkan: only use M-sized matmul on Apple GPUs L-sized and S-sized matmuls are broken on Apple GPUs, force using M-size with this vendor. Signed-off-by: Sergio Lopez --------- Signed-off-by: Sergio Lopez --- ggml-vulkan.cpp | 103 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 10 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 254f648a6..7834e635c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -27,6 +27,7 @@ #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) #define VK_VENDOR_ID_AMD 0x1002 +#define VK_VENDOR_ID_APPLE 0x106b #define VK_VENDOR_ID_INTEL 0x8086 #define VK_VENDOR_ID_NVIDIA 0x10de @@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct return ctx->pipeline_matmul_f32_aligned_l.align; } -static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; -#endif +static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { if (bit16_x && bit16_y) { - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? 
&ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; + } + if (bit16_x && !bit16_y) { + if (m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; + } + if (!bit16_x && bit16_y) { + GGML_ASSERT(false); + } + + if (m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; +} + +static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " M" << std::endl; +#endif + if (bit16_x && bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; + } + if (bit16_x && !bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; + } + if (!bit16_x && bit16_y) { + GGML_ASSERT(false); + } + return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; +} + +static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + if (bit16_x && bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; + } + if (bit16_x && !bit16_y) { + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; + } + if (!bit16_x && bit16_y) { + GGML_ASSERT(false); + } + return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; +} + +static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; +#endif + switch (ctx->device.lock()->vendor_id) { + case VK_VENDOR_ID_AMD: + return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned); + case VK_VENDOR_ID_APPLE: + return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned); + case VK_VENDOR_ID_INTEL: + return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned); + } + + if (bit16_x && bit16_y) { + if (m <= 32 || n <= 32) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << " S" << std::endl; +#endif + return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; + } + if (m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif @@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, return aligned ? 
&ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l; } if (bit16_x && !bit16_y) { - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif @@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, GGML_ASSERT(false); } - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif
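The vec_dot signature change threaded through ggml-quants.h, ggml.c, the tests and the vdot/q8dot tools earlier in this series boils down to a calling convention: regular kernels are still invoked with nrc = 1 and zero strides, while the ARM __ARM_FEATURE_MATMUL_INT8 kernels advertise nrows = 2 and receive byte strides for the extra row/column plus a result stride bs of 16 floats (matching the enlarged float tmp[32] scratch in mul_mat). The standalone scalar sketch below models that convention with a plain f32 dot product so the parameter meanings are easy to check; vec_dot_f32_model and its main() are illustrative only, not code from the patches, and the result layout (column c, row r at s[c*bs + r]) is inferred from the mul_mat changes above.

/*
 * Minimal scalar model of the extended vec_dot calling convention.
 * The real kernels (e.g. ggml_vec_dot_q4_0_q8_0) operate on quantized
 * blocks and only take the nrc == 2 path when __ARM_FEATURE_MATMUL_INT8
 * is available; this sketch just illustrates the parameters.
 */
#include <stdio.h>
#include <stddef.h>

/* Same parameter order as the new ggml_vec_dot_t:
 *   n   - number of elements per row/column
 *   s   - output; the result for column c, row r lands at s[c*bs + r]
 *   bs  - stride (in floats) between per-column result groups when nrc > 1
 *   vx  - nrc rows of x, bx bytes apart
 *   vy  - nrc columns of y, by bytes apart
 *   nrc - how many rows/columns to process at once (1 for plain kernels)
 */
static void vec_dot_f32_model(int n, float * s, size_t bs,
                              const void * vx, size_t bx,
                              const void * vy, size_t by, int nrc) {
    for (int c = 0; c < nrc; ++c) {
        for (int r = 0; r < nrc; ++r) {
            const float * x = (const float *)((const char *)vx + r*bx);
            const float * y = (const float *)((const char *)vy + c*by);
            float sum = 0.0f;
            for (int i = 0; i < n; ++i) {
                sum += x[i]*y[i];
            }
            s[c*bs + r] = sum;
        }
    }
}

int main(void) {
    float x[2][4] = { {1, 2, 3, 4}, {5, 6, 7, 8} };
    float y[2][4] = { {1, 1, 1, 1}, {2, 0, 2, 0} };
    float out[1];
    float tile[2*16];  /* mirrors the float tmp[32] scratch in mul_mat */

    /* single row/column: strides are ignored, exactly like the updated
     * call sites (ggml_vec_norm_f32, flash attention, the POC tools) */
    vec_dot_f32_model(4, out, 0, x[0], 0, y[0], 0, 1);
    printf("1x1: %.1f\n", out[0]);

    /* 2x2 tile: rows of x are sizeof(x[0]) bytes apart, columns of y
     * likewise, and results for the second column start at tile[16] */
    vec_dot_f32_model(4, tile, 16, x, sizeof(x[0]), y, sizeof(y[0]), 2);
    printf("2x2: %.1f %.1f | %.1f %.1f\n", tile[0], tile[1], tile[16], tile[17]);
    return 0;
}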