From 3dfda05956befb350745c5c2f7134d06adfe8724 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jul 2024 14:10:39 +0300 Subject: [PATCH 001/170] llama : de-duplicate deepseek2 norm --- src/llama.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 400a4232b..ddf0262d4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -12873,12 +12873,12 @@ struct llm_build_context { struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + if ((uint32_t) il < hparams.n_layer_dense_lead) { cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, @@ -12888,11 +12888,6 @@ struct llm_build_context { cb(cur, "ffn_out", il); } else { // MoE branch - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - ggml_tensor * moe_out = llm_build_moe_ffn(ctx0, cur, model.layers[il].ffn_gate_inp, From 16bdfa42acb09175e88cf97e9d9e4e48f616d120 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Mon, 15 Jul 2024 19:32:15 +0800 Subject: [PATCH 002/170] [SYCL] add concat through dim 1/2 (#8483) * add concat through dim 1/2 --- ggml/src/ggml-sycl.cpp | 67 +---------- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/concat.cpp | 195 +++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/concat.hpp | 21 ++++ 4 files changed, 218 insertions(+), 66 deletions(-) create mode 100644 ggml/src/ggml-sycl/concat.cpp create mode 100644 ggml/src/ggml-sycl/concat.hpp diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 5a890237f..36518ff93 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -291,29 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k, dst[i] = x[i] * x[i]; } -static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02, - const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (nidx >= ne0) { - return; - } - // operation - int offset_dst = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (item_ct1.get_group(0) < ne02) { // src0 - int offset_src = - nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - dst[offset_dst] = x[offset_src]; - } else { - int offset_src = - nidx + item_ct1.get_group(1) * ne0 + - (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); - dst[offset_dst] = y[offset_src]; - } -} - static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int ne13, const float sf0, const float sf1, @@ -1347,20 +1324,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k, }); } -static void concat_f32_sycl(const float *x, const float *y, float *dst, - const int ne0, int ne1, int ne2, int ne02, - queue_ptr stream) { - int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE; - sycl::range<3> gridDim(ne2, ne1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32(x, y, dst, ne0, ne02, item_ct1); - }); -} - static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int ne13, const float sf0, const float sf1, @@ -2429,28 +2392,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor (void) src1_dd; } -inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { -#pragma message("TODO: generalize concat kernel for dim != 2") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563") - int dim = dst->op_params[0]; - GGML_ASSERT(dim == 2); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream); - } - - (void) src1; - (void) dst; -} - inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, @@ -3359,12 +3300,6 @@ static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_ten GGML_SYCL_DEBUG("call %s done\n", __func__); } -static void ggml_sycl_concat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_concat); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - static void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale); @@ -4101,7 +4036,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens func = ggml_sycl_group_norm; break; case GGML_OP_CONCAT: - func = ggml_sycl_concat; + func = ggml_sycl_op_concat; break; case GGML_OP_UPSCALE: func = ggml_sycl_upscale; diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 2a789edfc..067181de3 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -13,6 +13,7 @@ #ifndef GGML_SYCL_BACKEND_HPP #define GGML_SYCL_BACKEND_HPP +#include "concat.hpp" #include "common.hpp" #include "convert.hpp" #include "dequantize.hpp" diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp new file mode 100644 index 000000000..632eedb9d --- /dev/null +++ b/ggml/src/ggml-sycl/concat.cpp @@ -0,0 +1,195 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "concat.hpp" +#include "common.hpp" + +static void concat_f32_dim0(const float *x, const float *y, float *dst, + const int ne0, const int ne00, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00) { // src0 + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) + + item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_dim1(const float *x, const float *y, float *dst, + const int ne0, const int ne01, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(1) < ne01) { // src0 + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + (item_ct1.get_group(1) - ne01) * ne0 + + item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_dim2(const float *x, const float *y, float *dst, + const int ne0, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(0) < ne02) { // src0 + int offset_src = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_sycl(const float *x, const float *y, float *dst, + int ne00, int ne01, int ne02, int ne0, int ne1, + int ne2, int dim, queue_ptr stream) { + int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + switch (dim) { + case 0: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); + }); + break; + case 1: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); + }); + break; + default: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); + }); + break; + } +} + +// non-contiguous kernel (slow) +static void concat_f32_sycl_non_cont( + queue_ptr stream, const char *src0, const char *src1, char *dst, + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00, + uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/, + int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10, + uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1, + int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, + uint64_t nb3, int32_t dim) { + sycl::range<3> gridDim(ne3, ne2, ne1); + stream->parallel_for( + sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + int64_t i3 = item_ct1.get_group(0); + int64_t i2 = item_ct1.get_group(1); + int64_t i1 = item_ct1.get_group(2); + + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); + + const float *x; + + for (int i0 = item_ct1.get_local_id(2); i0 < ne0; + i0 += item_ct1.get_local_range(2)) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const float *)(src0 + (i3)*nb03 + (i2)*nb02 + (i1)*nb01 + + (i0)*nb00); + } else { + x = (const float *)(src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + + (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10); + } + + float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); + + *y = *x; + } + }); +} + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst) { + queue_ptr stream = ctx.stream(); + + const int32_t dim = ((int32_t *)dst->op_params)[0]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + const float *src0_d = (const float *)src0->data; + const float *src1_d = (const float *)src1->data; + + float *dst_d = (float *)dst->data; + + if (dim != 3) { + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_sycl( + src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), + dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], + src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); + } + } else { + const size_t size0 = ggml_nbytes(src0); + const size_t size1 = ggml_nbytes(src1); + + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); + SYCL_CHECK(CHECK_TRY_ERROR( + stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + } + } else + concat_f32_sycl_non_cont( + stream, (const char *)src0->data, (const char *)src1->data, + (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], + src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], + dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); +} diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp new file mode 100644 index 000000000..5a04feaab --- /dev/null +++ b/ggml/src/ggml-sycl/concat.hpp @@ -0,0 +1,21 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONCAT_HPP +#define GGML_SYCL_CONCAT_HPP + +#include "common.hpp" + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst); + +#endif // GGML_SYCL_CONCAT_HPP From fc690b018e459012cd82c37f1343e9bb658987d1 Mon Sep 17 00:00:00 2001 From: NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com> Date: Mon, 15 Jul 2024 04:46:39 -0700 Subject: [PATCH 003/170] docs: fix links in development docs [no ci] (#8481) Fixes a few links to within the repo that were broken in the reorganization of the documentation in #8325. --- docs/development/HOWTO-add-model.md | 16 ++++++++-------- .../token_generation_performance_tips.md | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 2712b66c1..04c5ccbbe 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -9,15 +9,15 @@ Adding a model requires few steps: After following these steps, you can open PR. Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially: -- [main](../examples/main) -- [imatrix](../examples/imatrix) -- [quantize](../examples/quantize) -- [server](../examples/server) +- [main](/examples/main/) +- [imatrix](/examples/imatrix/) +- [quantize](/examples/quantize/) +- [server](/examples/server/) ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). +Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. @@ -31,7 +31,7 @@ class MyModel(Model): model_arch = gguf.MODEL_ARCH.GROK ``` -2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py) +2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py) Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`. @@ -54,7 +54,7 @@ Example for `falcon` model: As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist. -Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file. +Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file. If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it. @@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. -Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback). +Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). ## GGUF specification diff --git a/docs/development/token_generation_performance_tips.md b/docs/development/token_generation_performance_tips.md index c0840cad5..41b7232c9 100644 --- a/docs/development/token_generation_performance_tips.md +++ b/docs/development/token_generation_performance_tips.md @@ -1,7 +1,7 @@ # Token generation performance troubleshooting ## Verifying that the model is running on the GPU with CUDA -Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` From 9104bc20edf47f74dc49379b7d61d0c6e85a4882 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jul 2024 14:54:58 +0300 Subject: [PATCH 004/170] common : add --no-cont-batching arg (#6358) --- common/common.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 1e5fc30dd..9035c3592 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -797,6 +797,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cont_batching = true; return true; } + if (arg == "-nocb" || arg == "--no-cont-batching") { + params.cont_batching = false; + return true; + } if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; return true; @@ -1538,6 +1542,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); options.push_back({ "multi-modality" }); options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); From f17f39ff9cb71fa7b0eda0c649bb0ce9b2d8a6b8 Mon Sep 17 00:00:00 2001 From: M-A Date: Mon, 15 Jul 2024 08:04:56 -0400 Subject: [PATCH 005/170] server: update README.md with llama-server --help output [no ci] (#8472) The README.md had a stale information. In particular, the --ctx-size "defaults to 512" confused me and I had to check the code to confirm this was false. This the server is evolving rapidly, it's probably better to keep the source of truth at a single place (in the source) and generate the README.md based on that. Did: make llama-server ./llama-server --help > t.txt vimdiff t.txt examples/server/README.md I copied the content inside a backquote block. I would have preferred proper text but it would require a fair amount of surgery to make the current output compatible with markdown. A follow up could be to automate this process with a script. No functional change. --- examples/server/README.md | 334 +++++++++++++++++++++++++++++++------- 1 file changed, 273 insertions(+), 61 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index cb45ee06d..d7487ec1b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -15,69 +15,281 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). -**Command line options:** +## Usage -- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response. -- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores). -- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`. -- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` -- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused -- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused -- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused -- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. -- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`. -- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. -- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048` -- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512` -- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems -- `--numa distribute`: Spread execution evenly over all nodes -- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on -- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437 -- `--numa`: Attempt optimizations that may help on some NUMA systems. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. -- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600` -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1` -- `--port`: Set the port to listen. Default: `8080` -- `--path`: Path from which to serve static files. Default: disabled -- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. -- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. -- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled -- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error. -- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled -- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) -- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. -- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled. -- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512` -- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1` -- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. -- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled -- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled. -- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled -- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` -- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn` -- `--rope-freq-base N` : RoPE frequency base (default: loaded from model) -- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25) -- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation) -- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0) -- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0) -- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0) -- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls` -- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled) -- `-fa`, `--flash-attn` : enable flash attention (default: disabled). -- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) -- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) -- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. +``` +usage: ./llama-server [options] + +general: + + -h, --help, --usage print usage and exit + --version show version and build info + -v, --verbose print verbose information + --verbosity N set specific verbosity level (default: 0) + --verbose-prompt print a verbose prompt before generation (default: false) + --no-display-prompt don't print prompt at generation (default: false) + -co, --color colorise output to distinguish prompt and user input from generations (default: false) + -s, --seed SEED RNG seed (default: -1, use random seed for < 0) + -t, --threads N number of threads to use during generation (default: 8) + -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) + -td, --threads-draft N number of threads to use during generation (default: same as --threads) + -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) + --draft N number of tokens to draft for speculative decoding (default: 5) + -ps, --p-split N speculative decoding split probability (default: 0.1) + -lcs, --lookup-cache-static FNAME + path to static lookup cache to use for lookup decoding (not updated by generation) + -lcd, --lookup-cache-dynamic FNAME + path to dynamic lookup cache to use for lookup decoding (updated by generation) + -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) + -b, --batch-size N logical maximum batch size (default: 2048) + -ub, --ubatch-size N physical maximum batch size (default: 512) + --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) + --chunks N max number of chunks to process (default: -1, -1 = all) + -fa, --flash-attn enable Flash Attention (default: disabled) + -p, --prompt PROMPT prompt to start generation with + in conversation mode, this will be used as system prompt + (default: '') + -f, --file FNAME a file containing the prompt (default: none) + --in-file FNAME an input file (repeat to specify multiple files) + -bf, --binary-file FNAME binary file containing the prompt (default: none) + -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) + --no-escape do not process escape sequences + -ptc, --print-token-count N print token count every N tokens (default: -1) + --prompt-cache FNAME file to cache prompt state for faster startup (default: none) + --prompt-cache-all if specified, saves user input and generations to cache as well + not supported with --interactive or other interactive options + --prompt-cache-ro if specified, uses the prompt cache but does not update it + -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode + can be specified more than once for multiple prompts + -sp, --special special tokens output enabled (default: false) + -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix + if suffix/prefix are not specified, default chat template will be used + (default: false) + -i, --interactive run in interactive mode (default: false) + -if, --interactive-first run in interactive mode and wait for input right away (default: false) + -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' + --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string + --in-prefix STRING string to prefix user inputs with (default: empty) + --in-suffix STRING string to suffix after user inputs with (default: empty) + --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) + +sampling: + + --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' + (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) + --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) + --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) + --penalize-nl penalize newline tokens (default: false) + --temp N temperature (default: 0.8) + --top-k N top-k sampling (default: 40, 0 = disabled) + --top-p N top-p sampling (default: 0.9, 1.0 = disabled) + --min-p N min-p sampling (default: 0.1, 0.0 = disabled) + --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) + --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) + --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) + --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) + --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) + --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) + --dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) + --dynatemp-exp N dynamic temperature exponent (default: 1.0) + --mirostat N use Mirostat sampling. + Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. + (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) + --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) + --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) + -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, + i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', + or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' + --cfg-negative-prompt PROMPT + negative prompt to use for guidance (default: '') + --cfg-negative-prompt-file FNAME + negative prompt file to use for guidance + --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + +grammar: + + --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') + --grammar-file FNAME file to read grammar from + -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object + For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead + +embedding: + + --pooling {none,mean,cls,last} + pooling type for embeddings, use model default if unspecified + --attention {causal,non-causal} + attention type for embeddings, use model default if unspecified + +context hacking: + + --rope-scaling {none,linear,yarn} + RoPE frequency scaling method, defaults to linear unless specified by the model + --rope-scale N RoPE context scaling factor, expands context by a factor of N + --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) + --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N + --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) + --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) + --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) + --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) + --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) + -gan, --grp-attn-n N group-attention factor (default: 1) + -gaw, --grp-attn-w N group-attention width (default: 512.0) + -dkvc, --dump-kv-cache verbose print of the KV cache + -nkvo, --no-kv-offload disable KV offload + -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) + -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) + +perplexity: + + --all-logits return logits for all tokens in the batch (default: false) + --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f + --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) + --winogrande compute Winogrande score over random tasks from datafile supplied with -f + --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) + --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f + --multiple-choice-tasks N + number of tasks to use when computing the multiple choice score (default: 0) + --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base + --ppl-stride N stride for perplexity calculation (default: 0) + --ppl-output-type {0,1} output type for perplexity calculation (default: 0) + +parallel: + + -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) + -np, --parallel N number of parallel sequences to decode (default: 1) + -ns, --sequences N number of sequences to decode (default: 1) + -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) + +multi-modality: + + --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md + --image FILE path to an image file. use with multimodal models. Specify multiple times for batching + +backend: + + --rpc SERVERS comma separated list of RPC servers + --mlock force system to keep model in RAM rather than swapping or compressing + --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) + --numa TYPE attempt optimizations that help on some NUMA systems + - distribute: spread execution evenly over all nodes + - isolate: only spawn threads on CPUs on the node that execution started on + - numactl: use the CPU map provided by numactl + if run without this previously, it is recommended to drop the system page cache before using this + see https://github.com/ggerganov/llama.cpp/issues/1437 + +model: + + --check-tensors check model tensor data for invalid values (default: false) + --override-kv KEY=TYPE:VALUE + advanced option to override model metadata by key. may be specified multiple times. + types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + --control-vector FNAME add a control vector + note: this argument can be repeated to add multiple control vectors + --control-vector-scaled FNAME SCALE + add a control vector with user defined scaling SCALE + note: this argument can be repeated to add multiple scaled control vectors + --control-vector-layer-range START END + layer range to apply the control vector(s) to, start and end inclusive + -m, --model FNAME model path (default: models/$filename with filename from --hf-file + or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) + -md, --model-draft FNAME draft model for speculative decoding (default: unused) + -mu, --model-url MODEL_URL model download url (default: unused) + -hfr, --hf-repo REPO Hugging Face model repository (default: unused) + -hff, --hf-file FILE Hugging Face model file (default: unused) + -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) + +retrieval: + + --context-file FNAME file to load context from (repeat to specify multiple files) + --chunk-size N minimum length of embedded text chunks (default: 64) + --chunk-separator STRING + separator between chunks (default: ' + ') + +passkey: + + --junk N number of times to repeat the junk text (default: 250) + --pos N position of the passkey in the junk text (default: -1) + +imatrix: + + -o, --output FNAME output file (default: 'imatrix.dat') + --output-frequency N output the imatrix every N iterations (default: 10) + --save-frequency N save an imatrix copy every N iterations (default: 0) + --process-output collect data for the output tensor (default: false) + --no-ppl do not compute perplexity (default: true) + --chunk N start processing the input from chunk N (default: 0) + +bench: + + -pps is the prompt shared across parallel sequences (default: false) + -npp n0,n1,... number of prompt tokens + -ntg n0,n1,... number of text generation tokens + -npl n0,n1,... number of parallel prompts + +embedding: + + --embd-normalize normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) + --embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix + --embd-separator separator of embendings (default \n) for example "<#sep#>" + +server: + + --host HOST ip address to listen (default: 127.0.0.1) + --port PORT port to listen (default: 8080) + --path PATH path to serve static files from (default: ) + --embedding(s) enable embedding endpoint (default: disabled) + --api-key KEY API key to use for authentication (default: none) + --api-key-file FNAME path to file containing API keys (default: none) + --ssl-key-file FNAME path to file a PEM-encoded SSL private key + --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate + --timeout N server read/write timeout in seconds (default: 600) + --threads-http N number of threads used to process HTTP requests (default: -1) + --system-prompt-file FNAME + set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications + --log-format {text,json} + log output format: json or text (default: json) + --metrics enable prometheus compatible metrics endpoint (default: disabled) + --no-slots disables slots monitoring endpoint (default: enabled) + --slot-save-path PATH path to save slot kv cache (default: disabled) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + -sps, --slot-prompt-similarity SIMILARITY + how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) + + +logging: + + --simple-io use basic IO for better compatibility in subprocesses and limited consoles + -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) + --log-test Run simple logging test + --log-disable Disable trace logs + --log-enable Enable trace logs + --log-file FNAME Specify a log filename (without extension) + --log-new Create a separate new log file on start. Each log file will have unique name: "..log" + --log-append Don't truncate the old log file. + +cvector: + + -o, --output FNAME output file (default: 'control_vector.gguf') + --positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt') + --negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt') + --pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100) + --pca-iter N number of iterations used for PCA (default: 1000) + --method {pca,mean} dimensionality reduction method to be used (default: pca) +``` -**If compiled with `LLAMA_SERVER_SSL=ON`** -- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key -- `--ssl-cert-file FNAME`: path to file a PEM-encoded SSL certificate ## Build From 8fac431b0692e88cdc55250f29f8d4386be82c5d Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 15 Jul 2024 14:48:17 +0200 Subject: [PATCH 006/170] ggml : suppress unknown pragma 'GCC' on windows (#8460) This commit adds a macro guard to pragma GCC to avoid the following warning on windows: ```console C:\llama.cpp\ggml\src\ggml-aarch64.c(17,9): warning C4068: unknown pragma 'GCC' [C:\lama.cpp\build\ggml\src\ggml.vcxproj] ``` --- ggml/src/ggml-aarch64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 40838cf4f..26535b1c4 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -14,7 +14,9 @@ #include "ggml-aarch64.h" +#if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" +#endif #define UNUSED GGML_UNUSED From 4db8f60fe79a391e82b0464013ada123baced96a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Jul 2024 19:23:10 +0200 Subject: [PATCH 007/170] fix ci (#8494) --- examples/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index d7487ec1b..e477d1501 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -211,7 +211,7 @@ retrieval: --context-file FNAME file to load context from (repeat to specify multiple files) --chunk-size N minimum length of embedded text chunks (default: 64) - --chunk-separator STRING + --chunk-separator STRING separator between chunks (default: ' ') @@ -256,7 +256,7 @@ server: --threads-http N number of threads used to process HTTP requests (default: -1) --system-prompt-file FNAME set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications - --log-format {text,json} + --log-format {text,json} log output format: json or text (default: json) --metrics enable prometheus compatible metrics endpoint (default: disabled) --no-slots disables slots monitoring endpoint (default: enabled) From 97bdd26eee11fe109dec00de75690ceef61c03f2 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 15 Jul 2024 20:50:47 +0200 Subject: [PATCH 008/170] Refactor lora adapter support (#8332) * lora: load to devide buft * add patch tensor function * correct tensor patch * llama_lora_adapter_apply * correct ggml_backend_tensor_copy * add llm_build_mm * fix auto merge * update based on review comments * add convert script * no more transpose A * add f16 convert * add metadata check * add sanity check * fix ftype * add requirements * fix requirements * fix outfile * conversion: only allow selected models * fix types * cuda : do not use dmmv if the tensor does not have enough cols * llama : lora fixes * do not disable mmap with lora Co-authored-by: slaren * llm_build_lora_mm_id * convert_lora : MoE LoRA conversion support * convert_lora : prefer safetensors, similarly to convert_hf * convert_hf : simplify modify_tensors for InternLM2 * convert_lora : lazy conversion * llama : load and use alpha from LoRA adapters * llama : use llm_build_lora_mm in most model graphs * auto scale * Revert "auto scale" This reverts commit 42415a4874e0f963e4aca6796ea5dfb97cd17464. * remove redundant params * Apply suggestions from code review Co-authored-by: slaren * change kv metadata * move add_type to __init__ * convert_hf : move add_type to main() * convert_lora : use the GGUFWriter from Model instead of overwriting it --------- Co-authored-by: slaren Co-authored-by: Francis Couture-Harpin --- common/common.cpp | 13 +- convert_hf_to_gguf.py | 34 +- convert_lora_to_gguf.py | 374 ++++++ ggml/src/ggml-cuda.cu | 3 +- ggml/src/ggml.c | 4 +- gguf-py/gguf/constants.py | 10 + gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/quants.py | 2 +- include/llama.h | 37 +- requirements.txt | 1 + .../requirements-convert_lora_to_gguf.txt | 2 + src/llama.cpp | 1010 +++++++++-------- 12 files changed, 963 insertions(+), 530 deletions(-) create mode 100755 convert_lora_to_gguf.py create mode 100644 requirements/requirements-convert_lora_to_gguf.txt diff --git a/common/common.cpp b/common/common.cpp index 9035c3592..dbb724fbb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -685,7 +685,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa if (arg == "--lora") { CHECK_ARG params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; return true; } if (arg == "--lora-scaled") { @@ -693,7 +692,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa const char* lora_adapter = argv[i]; CHECK_ARG params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; return true; } if (arg == "--lora-base") { @@ -2089,19 +2087,14 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); - int err = llama_model_apply_lora_from_file(model, - lora_adapter.c_str(), - lora_scale, - ((i > 0) || params.lora_base.empty()) - ? NULL - : params.lora_base.c_str(), - params.n_threads); - if (err != 0) { + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); llama_free(lctx); llama_free_model(model); return std::make_tuple(nullptr, nullptr); } + llama_lora_adapter_set(lctx, adapter, lora_scale); } if (params.ignore_eos) { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 42dace219..a755b0a60 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2264,13 +2264,6 @@ class InternLM2Model(Model): special_vocab.add_to_gguf(self.gguf_writer) - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) @@ -2290,26 +2283,22 @@ class InternLM2Model(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] + n_embd = self.hparams["hidden_size"] q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads + head_dim = n_embd // num_heads num_groups = num_heads // q_per_kv - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = data_torch - # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + + qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. - # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) - # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) - # v = rearrange(v, " o g n i -> o (g n i)").T - v = v.reshape((v.shape[0], -1)).T + q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + return [ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), @@ -3585,6 +3574,7 @@ def main() -> None: small_first_shard=args.no_tensor_first_split) logger.info("Set model parameters") + model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) model_instance.set_gguf_parameters() logger.info("Set model tokenizer") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py new file mode 100755 index 000000000..4bb939d45 --- /dev/null +++ b/convert_lora_to_gguf.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from dataclasses import dataclass +import logging +import argparse +import os +import sys +import json +from math import prod +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +# reuse model definitions from convert_hf_to_gguf.py +from convert_hf_to_gguf import LazyTorchTensor, Model + +logger = logging.getLogger("lora-to-gguf") + + +@dataclass +class PartialLoraTensor: + A: Tensor | None = None + B: Tensor | None = None + + +# magic to support tensor shape modifications and splitting +class LoraTorchTensor: + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) + _rank: int + + def __init__(self, A: Tensor, B: Tensor): + assert len(A.shape) == len(B.shape) + assert A.shape[-2] == B.shape[-1] + if A.dtype != B.dtype: + A = A.to(torch.float32) + B = B.to(torch.float32) + self._lora_A = A + self._lora_B = B + self._rank = B.shape[-1] + + def get_lora_A_B(self) -> tuple[Tensor, Tensor]: + return (self._lora_A, self._lora_B) + + def __getitem__( + self, + indices: ( + SupportsIndex + | slice + | tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature + ), + ) -> LoraTorchTensor: + shape = self.shape + if isinstance(indices, SupportsIndex): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + raise NotImplementedError # can't return a vector + elif isinstance(indices, slice): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + return LoraTorchTensor(self._lora_A, self._lora_B[indices]) + elif isinstance(indices, tuple): + assert len(indices) > 0 + if indices[-1] is Ellipsis: + return self[indices[:-1]] + # expand ellipsis + indices = tuple( + u + for v in ( + ( + (slice(None, None) for _ in range(len(indices) - 1)) + if i is Ellipsis + else (i,) + ) + for i in indices + ) + for u in v + ) + + if len(indices) < len(shape): + indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape)))) + + # TODO: make sure this is correct + indices_A = ( + *( + ( + j.__index__() % self._lora_A.shape[i] + if isinstance(j, SupportsIndex) + else slice(None, None) + ) + for i, j in enumerate(indices[:-2]) + ), + slice(None, None), + indices[-1], + ) + indices_B = indices[:-1] + return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) + else: + raise NotImplementedError # unknown indice type + + @property + def dtype(self) -> torch.dtype: + assert self._lora_A.dtype == self._lora_B.dtype + return self._lora_A.dtype + + @property + def shape(self) -> tuple[int, ...]: + assert len(self._lora_A.shape) == len(self._lora_B.shape) + return (*self._lora_B.shape[:-1], self._lora_A.shape[-1]) + + def size(self, dim=None): + assert dim is None + return self.shape + + def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: + if isinstance(shape[0], tuple): + new_shape: tuple[int, ...] = shape[0] + else: + new_shape = cast(tuple[int, ...], shape) + orig_shape = self.shape + if len(new_shape) < 2: + raise NotImplementedError # can't become a vector + + # expand -1 in the shape + if any(dim == -1 for dim in new_shape): + n_elems = prod(orig_shape) + n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) + assert n_elems % n_new_elems == 0 + new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),) + + if new_shape[-1] != orig_shape[-1]: + raise NotImplementedError # can't reshape the row size trivially + + shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) + shape_B = (*new_shape[:-1], self._rank) + return LoraTorchTensor( + self._lora_A.reshape(shape_A), + self._lora_B.reshape(shape_B), + ) + + def reshape_as(self, other: Tensor) -> LoraTorchTensor: + return self.reshape(*other.shape) + + def view(self, *size: int) -> LoraTorchTensor: + return self.reshape(*size) + + def permute(self, *dims: int) -> LoraTorchTensor: + shape = self.shape + dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) + if dims[-1] == -1: + # TODO: support higher dimensional A shapes bigger than 1 + assert all(dim == 1 for dim in self._lora_A.shape[:-2]) + return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) + if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: + return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) + else: + # TODO: compose the above two + raise NotImplementedError + + def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: + shape = self.shape + dims = [i for i in range(len(shape))] + dims[dim0], dims[dim1] = dims[dim1], dims[dim0] + return self.permute(*dims) + + def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor: + return self.transpose(axis0, axis1) + + def to(self, *args, **kwargs): + return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs)) + + @classmethod + def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.permute: + return type(args[0]).permute(*args, **kwargs) + elif func is torch.reshape: + return type(args[0]).reshape(*args, **kwargs) + elif func is torch.stack: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + return LoraTorchTensor( + torch.stack([a._lora_A for a in args[0]], dim), + torch.stack([b._lora_B for b in args[0]], dim), + ) + elif func is torch.cat: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + if len(args[0][0].shape) > 2: + return LoraTorchTensor( + torch.cat([a._lora_A for a in args[0]], dim), + torch.cat([b._lora_B for b in args[0]], dim), + ) + elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]): + return LoraTorchTensor( + args[0][0]._lora_A, + torch.cat([b._lora_B for b in args[0]], dim), + ) + else: + raise NotImplementedError + else: + raise NotImplementedError + + +def get_base_tensor_name(lora_tensor_name: str) -> str: + base_name = lora_tensor_name.replace("base_model.model.", "") + base_name = base_name.replace(".lora_A.weight", ".weight") + base_name = base_name.replace(".lora_B.weight", ".weight") + return base_name + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--base", type=Path, required=True, + help="directory containing base model file", + ) + parser.add_argument( + "lora_path", type=Path, + help="directory containing LoRA adapter file", + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + ftype = ftype_map[args.outtype] + + dir_base_model: Path = args.base + dir_lora: Path = args.lora_path + lora_config = dir_lora / "adapter_config.json" + input_model = dir_lora / "adapter_model.safetensors" + + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' + + if os.path.exists(input_model): + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + + lora_model = load_file(input_model, device="cpu") + else: + input_model = os.path.join(dir_lora, "adapter_model.bin") + lora_model = torch.load(input_model, map_location="cpu", weights_only=True) + + # load base model + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): + try: + model_class = Model.from_model_architecture(hparams["architectures"][0]) + except NotImplementedError: + logger.error(f"Model {hparams['architectures'][0]} is not supported") + sys.exit(1) + + class LoraModel(model_class): + model_arch = model_class.model_arch + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_map: dict[str, PartialLoraTensor] = {} + + for name, tensor in lora_model.items(): + if self.lazy: + tensor = LazyTorchTensor.from_eager(tensor) + base_name = get_base_tensor_name(name) + is_lora_a = ".lora_A.weight" in name + is_lora_b = ".lora_B.weight" in name + if not is_lora_a and not is_lora_b: + if ".base_layer.weight" in name: + continue + logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") + sys.exit(1) + + if base_name in tensor_map: + if is_lora_a: + tensor_map[base_name].A = tensor + else: + tensor_map[base_name].B = tensor + else: + if is_lora_a: + tensor_map[base_name] = PartialLoraTensor(A=tensor) + else: + tensor_map[base_name] = PartialLoraTensor(B=tensor) + + for name, tensor in tensor_map.items(): + assert tensor.A is not None + assert tensor.B is not None + yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + dest = super().modify_tensors(data_torch, name, bid) + for dest_name, dest_data in dest: + assert isinstance(dest_data, LoraTorchTensor) + lora_a, lora_b = dest_data.get_lora_A_B() + + yield (dest_name + ".lora_a", lora_a) + yield (dest_name + ".lora_b", lora_b) + + model_instance = LoraModel( + dir_base_model, + ftype, + fname_out, + is_big_endian=args.bigendian, + use_temp_file=False, + eager=args.no_lazy, + model_name=None, + ) + + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha = lparams["lora_alpha"] + + model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) + model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) + model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + logger.info("Exporting model...") + model_instance.write() + logger.info(f"Model successfully exported to {model_instance.fname_out}") diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index ed784ea1c..39e345b66 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1876,7 +1876,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 - && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1; + && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2 + && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9a5414787..60b3c5e7a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19478,7 +19478,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); + fprintf(fp, " rankdir = TB;\n"); for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; @@ -19540,7 +19540,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); - if (ggml_nelements(node) < 5) { + if (ggml_nelements(node) < 5 && node->data != NULL) { fprintf(fp, " | ("); for (int j = 0; j < ggml_nelements(node); j++) { if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a95a44237..5eb3df706 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -19,6 +19,7 @@ GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h class Keys: class General: + TYPE = "general.type" ARCHITECTURE = "general.architecture" QUANTIZATION_VERSION = "general.quantization_version" ALIGNMENT = "general.alignment" @@ -120,11 +121,20 @@ class Keys: MIDDLE_ID = "tokenizer.ggml.middle_token_id" EOT_ID = "tokenizer.ggml.eot_token_id" + class Adapter: + TYPE = "adapter.type" + LORA_ALPHA = "adapter.lora.alpha" + # # recommended mapping of model tensor names for storage in gguf # +class GGUFType: + MODEL = "model" + ADAPTER = "adapter" + + class MODEL_ARCH(IntEnum): LLAMA = auto() FALCON = auto() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index cf9554162..b0197961d 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -424,6 +424,9 @@ class GGUFWriter: fout.close() self.fout = None + def add_type(self, type_name: str) -> None: + self.add_string(Keys.General.TYPE, type_name) + def add_architecture(self) -> None: self.add_string(Keys.General.ARCHITECTURE, self.arch) diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index b22eec166..16e0a9aaa 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -43,7 +43,7 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np. osize *= dim out = np.empty(shape=osize, dtype=otype) # compute over groups of 16 rows (arbitrary, but seems good for performance) - n_groups = rows.shape[0] // 16 + n_groups = (rows.shape[0] // 16) or 1 np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out) return out.reshape(oshape) diff --git a/include/llama.h b/include/llama.h index 3970c3aeb..c57d21f0c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -411,6 +411,9 @@ extern "C" { const char * content; } llama_chat_message; + // lora adapter + struct llama_lora_adapter; + // Helpers for getting default parameters LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -510,18 +513,28 @@ extern "C" { const char * fname_out, const llama_model_quantize_params * params); - // Apply a LoRA adapter to a loaded model - // path_base_model is the path to a higher quality model to use as a base for - // the layers modified by the adapter. Can be NULL to use the current loaded model. - // The model needs to be reloaded before applying a new adapter, otherwise the adapter - // will be applied on top of the previous one - // Returns 0 on success - LLAMA_API int32_t llama_model_apply_lora_from_file( - const struct llama_model * model, - const char * path_lora, - float scale, - const char * path_base_model, - int32_t n_threads); + // Load a LoRA adapter from file + // The loaded adapter will be associated to the given model, and will be free when the model is deleted + LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( + struct llama_model * model, + const char * path_lora); + + // Add a loaded LoRA adapter to given context + // This will not modify model's weight + LLAMA_API int32_t llama_lora_adapter_set( + struct llama_context * ctx, + struct llama_lora_adapter * adapter, + float scale); + + // Remove a LoRA adapter from given context + // Return -1 if the adapter is not present in the context + LLAMA_API int32_t llama_lora_adapter_remove( + struct llama_context * ctx, + struct llama_lora_adapter * adapter); + + // Manually free a LoRA adapter + // Note: loaded adapters will be free when the associated model is deleted + LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter); // Apply a loaded control vector to a llama_context, or if data is NULL, clear // the currently loaded vector. diff --git a/requirements.txt b/requirements.txt index 52456c2e6..9e190ae27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ -r ./requirements/requirements-convert_hf_to_gguf.txt -r ./requirements/requirements-convert_hf_to_gguf_update.txt -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt +-r ./requirements/requirements-convert_lora_to_gguf.txt diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt new file mode 100644 index 000000000..5758076c4 --- /dev/null +++ b/requirements/requirements-convert_lora_to_gguf.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert_hf_to_gguf.txt +--extra-index-url https://download.pytorch.org/whl/cpu diff --git a/src/llama.cpp b/src/llama.cpp index ddf0262d4..07bb42713 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -287,6 +287,7 @@ static const std::map LLM_ARCH_NAMES = { }; enum llm_kv { + LLM_KV_GENERAL_TYPE, LLM_KV_GENERAL_ARCHITECTURE, LLM_KV_GENERAL_QUANTIZATION_VERSION, LLM_KV_GENERAL_ALIGNMENT, @@ -377,9 +378,13 @@ enum llm_kv { LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_MIDDLE_ID, LLM_KV_TOKENIZER_EOT_ID, + + LLM_KV_ADAPTER_TYPE, + LLM_KV_ADAPTER_LORA_ALPHA, }; static const std::map LLM_KV_NAMES = { + { LLM_KV_GENERAL_TYPE, "general.type" }, { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, @@ -470,6 +475,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, }; struct LLM_KV { @@ -2703,6 +2711,9 @@ struct llama_model { int64_t t_load_us = 0; int64_t t_start_us = 0; + // keep track of loaded lora adapters + std::set lora_adapters; + ~llama_model() { for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); @@ -2715,6 +2726,9 @@ struct llama_model { #endif ggml_backend_buffer_free(buf); } + while (!lora_adapters.empty()) { + llama_lora_adapter_free(*lora_adapters.begin()); + } } }; @@ -2819,6 +2833,52 @@ struct llama_context { // control vectors struct llama_control_vector cvec; + + // lora adapters and scales + std::unordered_map lora_adapters; +}; + +struct llama_lora_weight { + struct ggml_tensor * a = nullptr; + struct ggml_tensor * b = nullptr; + llama_lora_weight() = default; + llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {} +}; + +struct llama_lora_adapter { + struct llama_model * base_model; + // map tensor name to lora_a_b + std::unordered_map ab_map; + std::vector ctxs; + std::vector bufs; + + float alpha; + + llama_lora_adapter(struct llama_model * base_model): base_model(base_model) { + base_model->lora_adapters.insert(this); + } + + llama_lora_weight * get_weight(struct ggml_tensor * w) { + std::string name(w->name); + auto pos = ab_map.find(name); + if (ab_map.find(name) != ab_map.end()) { + return &pos->second; + } + return nullptr; + } + + ~llama_lora_adapter() { + for (struct ggml_context * ctx : ctxs) { + ggml_free(ctx); + } + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } + auto pos = base_model->lora_adapters.find(this); + if (pos != base_model->lora_adapters.end()) { + base_model->lora_adapters.erase(pos); + } + } }; static size_t llama_get_device_count(const llama_model & model) { @@ -7809,6 +7869,58 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); } +// do mat_mul, while optionally apply lora +static struct ggml_tensor * llm_build_lora_mm( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, + struct ggml_tensor * cur) { + struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_weight * lora = it.first->get_weight(w); + if (lora == nullptr) { + continue; + } + const float alpha = it.first->alpha; + const float rank = (float) lora->b->ne[0]; + const float scale = alpha ? it.second * alpha / rank : it.second; + struct ggml_tensor * ab_cur = ggml_mul_mat( + ctx0, lora->b, + ggml_mul_mat(ctx0, lora->a, cur) + ); + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + +// do mat_mul_id, while optionally apply lora +static struct ggml_tensor * llm_build_lora_mm_id( + struct llama_context & lctx, + struct ggml_context * ctx0, + struct ggml_tensor * w, // struct ggml_tensor * as + struct ggml_tensor * cur, // struct ggml_tensor * b + struct ggml_tensor * ids) { + struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + for (auto & it : lctx.lora_adapters) { + struct llama_lora_weight * lora = it.first->get_weight(w); + if (lora == nullptr) { + continue; + } + const float alpha = it.first->alpha; + const float rank = (float) lora->b->ne[0]; + const float scale = alpha ? it.second * alpha / rank : it.second; + struct ggml_tensor * ab_cur = ggml_mul_mat_id( + ctx0, lora->b, + ggml_mul_mat_id(ctx0, lora->a, cur, ids), + ids + ); + ab_cur = ggml_scale(ctx0, ab_cur, scale); + res = ggml_add(ctx0, res, ab_cur); + } + return res; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -7843,6 +7955,7 @@ static struct ggml_tensor * llm_build_norm( static struct ggml_tensor * llm_build_ffn( struct ggml_context * ctx, + struct llama_context & lctx, struct ggml_tensor * cur, struct ggml_tensor * up, struct ggml_tensor * up_b, @@ -7858,7 +7971,7 @@ static struct ggml_tensor * llm_build_ffn( llm_ffn_gate_type type_gate, const llm_build_cb & cb, int il) { - struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur; + struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur; cb(tmp, "ffn_up", il); if (up_b) { @@ -7875,12 +7988,12 @@ static struct ggml_tensor * llm_build_ffn( switch (type_gate) { case LLM_FFN_SEQ: { - cur = ggml_mul_mat(ctx, gate, tmp); + cur = llm_build_lora_mm(lctx, ctx, gate, tmp); cb(cur, "ffn_gate", il); } break; case LLM_FFN_PAR: { - cur = ggml_mul_mat(ctx, gate, cur); + cur = llm_build_lora_mm(lctx, ctx, gate, cur); cb(cur, "ffn_gate", il); } break; } @@ -7948,7 +8061,7 @@ static struct ggml_tensor * llm_build_ffn( } if (down) { - cur = ggml_mul_mat(ctx, down, cur); + cur = llm_build_lora_mm(lctx, ctx, down, cur); } if (down_b) { @@ -7969,6 +8082,7 @@ static struct ggml_tensor * llm_build_ffn( static struct ggml_tensor * llm_build_moe_ffn( struct ggml_context * ctx, + struct llama_context & lctx, struct ggml_tensor * cur, struct ggml_tensor * gate_inp, struct ggml_tensor * up_exps, @@ -7985,7 +8099,7 @@ static struct ggml_tensor * llm_build_moe_ffn( int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; - ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens] + ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] @@ -8017,10 +8131,10 @@ static struct ggml_tensor * llm_build_moe_ffn( } cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); - ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(gate, "ffn_moe_gate", il); switch (type_op) { @@ -8041,7 +8155,7 @@ static struct ggml_tensor * llm_build_moe_ffn( ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] cb(par, "ffn_moe_gate_par", il); - ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); experts = ggml_mul(ctx, experts, weights); @@ -8069,9 +8183,7 @@ static struct ggml_tensor * llm_build_moe_ffn( static struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, - const llama_model & model, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -8083,6 +8195,10 @@ static struct ggml_tensor * llm_build_kqv( float kq_scale, const llm_build_cb & cb, int il) { + const llama_model & model = lctx.model; + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head(il); const int64_t n_head_kv = hparams.n_head_kv(il); @@ -8181,7 +8297,7 @@ static struct ggml_tensor * llm_build_kqv( ggml_build_forward_expand(graph, cur); if (wo) { - cur = ggml_mul_mat(ctx, wo, cur); + cur = llm_build_lora_mm(lctx, ctx, wo, cur); } if (wo_b) { @@ -8197,9 +8313,7 @@ static struct ggml_tensor * llm_build_kqv( static struct ggml_tensor * llm_build_kv( struct ggml_context * ctx, - const llama_model & model, - const llama_hparams & hparams, - const llama_cparams & cparams, + struct llama_context & lctx, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -8214,6 +8328,8 @@ static struct ggml_tensor * llm_build_kv( float kq_scale, const llm_build_cb & cb, int il) { + const llama_hparams & hparams = lctx.model.hparams; + const llama_cparams & cparams = lctx.cparams; // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced @@ -8225,7 +8341,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, + cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); @@ -8687,21 +8803,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -8722,7 +8838,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8745,7 +8861,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -8759,7 +8875,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -8789,7 +8905,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -8825,13 +8941,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); switch (model.type) { @@ -8857,7 +8973,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8879,7 +8995,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -8904,7 +9020,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -8940,13 +9056,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -8962,7 +9078,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -8984,7 +9100,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9007,7 +9123,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9056,7 +9172,7 @@ struct llm_build_context { cur = attn_norm; } - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -9083,7 +9199,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9100,7 +9216,7 @@ struct llm_build_context { // feed forward { - cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9127,7 +9243,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9172,21 +9288,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -9207,7 +9323,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -9239,7 +9355,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9278,7 +9394,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); // Grok // multiply logits by output_multiplier_scale of 0.5773502691896257 @@ -9329,7 +9445,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -9357,7 +9473,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9380,7 +9496,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_out_norm", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -9409,7 +9525,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -9451,7 +9567,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -9467,7 +9583,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9491,7 +9607,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9514,7 +9630,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9546,13 +9662,13 @@ struct llm_build_context { // self-attention { - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -9561,7 +9677,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9583,7 +9699,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9608,7 +9724,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9660,7 +9776,7 @@ struct llm_build_context { // self-attention if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq); cb(Qcur, "Qcur", il); if (model.layers[il].attn_q_norm) { @@ -9670,7 +9786,7 @@ struct llm_build_context { LLM_NORM, cb, il); } - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk); cb(Kcur, "Kcur", il); if (model.layers[il].attn_k_norm) { @@ -9679,14 +9795,14 @@ struct llm_build_context { model.layers[il].attn_k_norm_b, LLM_NORM, cb, il); } - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv); cb(Vcur, "Vcur", il); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); } else { // compute Q and K and RoPE them - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); @@ -9735,7 +9851,7 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); if (model.layers[il].bo) { cb(cur, "kqv_wo", il); } @@ -9768,21 +9884,21 @@ struct llm_build_context { // feed-forward network if (model.arch == LLM_ARCH_BERT) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, cb, il); } else { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -9840,7 +9956,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -9856,7 +9972,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -9880,7 +9996,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -9903,7 +10019,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -9950,7 +10066,7 @@ struct llm_build_context { { cur = attn_norm; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); if (model.layers[il].bqkv){ @@ -9988,13 +10104,13 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } else { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10018,7 +10134,7 @@ struct llm_build_context { model.layers[il].ffn_norm_b, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10043,7 +10159,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10083,21 +10199,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -10139,7 +10255,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10167,7 +10283,7 @@ struct llm_build_context { // parallel residual cur = inpSA; } - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10193,7 +10309,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10228,7 +10344,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10258,7 +10374,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10280,7 +10396,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10305,7 +10421,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10343,17 +10459,17 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -10372,7 +10488,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10393,7 +10509,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10417,7 +10533,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10458,17 +10574,17 @@ struct llm_build_context { // self_attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); @@ -10487,7 +10603,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10510,7 +10626,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -10523,14 +10639,14 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur); cb(cur_gate_inp, "ffn_shexp_gate_inp", il); // sigmoid ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); cb(cur_gate, "ffn_shexp_gate", il); - ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, + ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -10563,7 +10679,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10605,7 +10721,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10615,9 +10731,9 @@ struct llm_build_context { Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -10644,7 +10760,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10659,7 +10775,7 @@ struct llm_build_context { // FF { - ffn_output = llm_build_ffn(ctx0, attn_norm_output, + ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -10683,7 +10799,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); @@ -10729,7 +10845,7 @@ struct llm_build_context { struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); @@ -10737,9 +10853,9 @@ struct llm_build_context { Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } else { - Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); - Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); - Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); } cb(Qcur, "Qcur", il); @@ -10764,7 +10880,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -10788,7 +10904,7 @@ struct llm_build_context { // special-case: the up and gate tensors are merged into a single tensor // TOOD: support into llm_build_ffn { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10811,7 +10927,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10851,13 +10967,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -10872,7 +10988,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10890,7 +11006,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -10916,7 +11032,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -10958,7 +11074,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -10974,7 +11090,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -10998,7 +11114,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11021,7 +11137,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11057,7 +11173,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -11085,7 +11201,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11109,7 +11225,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11132,7 +11248,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11170,21 +11286,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); // if (model.layers[il].bq) { // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); // cb(Qcur, "Qcur", il); // } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); // if (model.layers[il].bk) { // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); // cb(Kcur, "Kcur", il); // } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); // if (model.layers[il].bv) { // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11205,7 +11321,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11226,7 +11342,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11250,7 +11366,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11288,21 +11404,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11323,7 +11439,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11344,7 +11460,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11368,7 +11484,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11419,21 +11535,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11454,7 +11570,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11481,7 +11597,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11515,7 +11631,7 @@ struct llm_build_context { cb(cur, "lmhead_scaling", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11552,13 +11668,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11576,7 +11692,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11598,7 +11714,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11623,7 +11739,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11665,13 +11781,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -11694,7 +11810,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il); } @@ -11721,7 +11837,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -11751,7 +11867,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); @@ -11796,21 +11912,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -11831,7 +11947,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -11853,7 +11969,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -11877,7 +11993,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -11929,7 +12045,7 @@ struct llm_build_context { cb(cur, "attn_norm", il); // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} - struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); + struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, cur); // split the above in two // => {d_inner, n_tokens} struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); @@ -11968,14 +12084,14 @@ struct llm_build_context { // ssm { // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} - struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); + struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_x, x); // split struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank); struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} - dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); + dt = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); // Custom operator to optimize the parallel associative scan @@ -12006,7 +12122,7 @@ struct llm_build_context { y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens} - cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, y); } // residual @@ -12025,7 +12141,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12064,21 +12180,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); @@ -12124,7 +12240,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12141,7 +12257,7 @@ struct llm_build_context { // feed-forward network { - cur = llm_build_ffn(ctx0, ffn_inp, + cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12168,7 +12284,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); @@ -12221,21 +12337,21 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); if (hparams.f_clamp_kqv > 0.0f) { Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Qcur, "Qcur", il); } - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); if (hparams.f_clamp_kqv > 0.0f) { Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); cb(Kcur, "Kcur", il); } - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); if (hparams.f_clamp_kqv > 0.0f) { Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); @@ -12256,7 +12372,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, nullptr, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12278,7 +12394,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12304,7 +12420,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12344,7 +12460,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); @@ -12383,7 +12499,7 @@ struct llm_build_context { Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); cb(Qcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12405,7 +12521,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12429,7 +12545,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12464,7 +12580,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -12492,7 +12608,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12517,7 +12633,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12548,7 +12664,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -12571,7 +12687,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12612,13 +12728,13 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); cb(Vcur, "Vcur", il); Qcur = ggml_rope_ext( @@ -12635,7 +12751,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } @@ -12657,7 +12773,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12674,7 +12790,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm_exps", il); - cur = llm_build_moe_ffn(ctx0, cur, + cur = llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12703,7 +12819,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -12857,7 +12973,7 @@ struct llm_build_context { struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(k_states, "k_states", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } @@ -12879,7 +12995,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); if ((uint32_t) il < hparams.n_layer_dense_lead) { - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -12889,7 +13005,7 @@ struct llm_build_context { } else { // MoE branch ggml_tensor * moe_out = - llm_build_moe_ffn(ctx0, cur, + llm_build_moe_ffn(ctx0, lctx, cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, model.layers[il].ffn_gate_exps, @@ -12902,7 +13018,7 @@ struct llm_build_context { // FFN shared expert { - ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up_shexp, NULL, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL, model.layers[il].ffn_down_shexp, NULL, NULL, @@ -12967,7 +13083,7 @@ struct llm_build_context { // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale); cb(Qcur, "Qcur", il); if (model.layers[il].bq) { @@ -12976,7 +13092,7 @@ struct llm_build_context { } // B1.K - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale); cb(Kcur, "Kcur", il); if (model.layers[il].bk) { @@ -12985,7 +13101,7 @@ struct llm_build_context { } // B1.V - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale); cb(Vcur, "Vcur", il); if (model.layers[il].bv) { @@ -13007,7 +13123,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, NULL, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13016,7 +13132,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "attn_sub_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale); if (model.layers[il].bo) { cur = ggml_add(ctx0, cur, model.layers[il].bo); @@ -13040,7 +13156,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale, NULL, NULL, NULL, @@ -13053,7 +13169,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_sub_norm", il); - cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale); cb(cur, "ffn_down", il); @@ -13072,7 +13188,7 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -13174,7 +13290,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up_enc, NULL, NULL, model.layers[il].ffn_gate_enc, NULL, NULL, model.layers[il].ffn_down_enc, NULL, NULL, @@ -13354,7 +13470,7 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // T5 uses relu, flan-T5 uses gelu-gated - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -13420,7 +13536,7 @@ struct llm_build_context { // self-attention { - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -13436,7 +13552,7 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); } @@ -13460,7 +13576,7 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, @@ -13479,7 +13595,7 @@ struct llm_build_context { LLM_NORM, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -13521,7 +13637,7 @@ struct llm_build_context { struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); @@ -13549,7 +13665,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur_rope", il); - cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, NULL, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); @@ -13574,7 +13690,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, + cur = llm_build_ffn(ctx0, lctx, cur, model.layers[il].ffn_up, NULL, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, @@ -13594,7 +13710,7 @@ struct llm_build_context { LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); - cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -18458,284 +18574,212 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } -static int llama_apply_lora_from_file_internal( - const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads -) { - LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); +static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) { + LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); - const int64_t t_start_lora_us = ggml_time_us(); - - llama_file fin(path_lora, "rb"); - - // verify magic and version - { - uint32_t magic = fin.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); - return 1; - } - - uint32_t format_version = fin.read_u32(); - if (format_version != 1) { - LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); - return 1; - } - } - - int32_t lora_r = fin.read_u32(); - int32_t lora_alpha = fin.read_u32(); - float scaling = scale * (float)lora_alpha / (float)lora_r; - - LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - - // load base model - std::unique_ptr ml; - if (path_base_model) { - LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); - ml->init_mappings(/*prefetch*/ false); // no prefetching - } - - struct tensor_meta { - std::string name; - ggml_type type; - int32_t ne[2]; - size_t offset; + ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &ctx, }; - std::map tensor_meta_map; - - // load all tensor meta - while (true) { - if (fin.tell() == fin.size) { - // eof - break; - } - - int32_t n_dims; - int32_t name_len; - int32_t ftype; - - fin.read_raw(&n_dims, sizeof(n_dims)); - fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); - - if (n_dims != 1 && n_dims != 2) { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read_raw(&ne[i], sizeof(ne[i])); - } - - std::string name; - { - GGML_ASSERT(name_len < GGML_MAX_NAME); - char buf[GGML_MAX_NAME]; - fin.read_raw(buf, name_len); - name = std::string(buf, name_len); - } - - // check for lora suffix - std::string lora_suffix; - if (name.length() > 6) { - lora_suffix = name.substr(name.length() - 6); - } - if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; - } - - // tensor type - ggml_type wtype; - switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; - default: - { - LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", - __func__, ftype); - return 1; - } - } - - // data offset - size_t offset = fin.tell(); - offset = (offset + 31) & -32; - - // skip tensor data - fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - - tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); + struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params); + if (!ctx_gguf) { + throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora)); } - bool warned = false; - int n_tensors = 0; - - // apply - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); - if (backend_cpu == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); - return 1; - } - ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); - - std::vector> read_buf; - for (const auto & it : model.tensors_by_name) { - const std::string & base_name = it.first; - ggml_tensor * model_t = it.second; - - if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || - tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { - continue; - } - - tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); - tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - - ggml_init_params lora_init_params = { - /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), - /* .mem_buffer */ nullptr, - /* .no_alloc */ true, + // check metadata + { + auto get_kv_str = [&](const std::string & key) -> std::string { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); }; - ggml_context * lora_ctx = ggml_init(lora_init_params); - if (lora_ctx == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); - ggml_backend_free(backend_cpu); - return 1; + auto get_kv_f32 = [&](const std::string & key) -> float { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id); + }; + LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); + + auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE)); + if (general_type != "adapter") { + gguf_free(ctx_gguf); + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); } - // create tensors - ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); - ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); - ggml_set_name(loraA, metaA.name.c_str()); - ggml_set_name(loraB, metaB.name.c_str()); + auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE)); + auto general_arch = llm_arch_from_string(general_arch_str); + if (general_arch != model->arch) { + gguf_free(ctx_gguf); + throw std::runtime_error("model arch and LoRA arch mismatch"); + } - ggml_tensor * base_t; - if (ml) { - if (!ml->get_tensor_meta(base_name.c_str())) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; + auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE)); + if (adapter_type != "lora") { + gguf_free(ctx_gguf); + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA)); + } + + int n_tensors = gguf_get_n_tensors(ctx_gguf); + + // contexts for each buffer type + std::map ctx_map; + auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { + auto it = ctx_map.find(buft); + if (it == ctx_map.end()) { + // add a new context + struct ggml_init_params params = { + /*.mem_size =*/ n_tensors*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * buft_ctx = ggml_init(params); + ctx_map[buft] = buft_ctx; + return buft_ctx; + }; + return it->second; + }; + + // bundle lora_a and lora_b into pairs + std::map ab_map; + auto str_endswith = [](const std::string & str, const std::string & suffix) { + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; + }; + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name(cur->name); + if (str_endswith(name, ".lora_a")) { + replace_all(name, ".lora_a", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = llama_lora_weight(cur, nullptr); + } else { + ab_map[name].a = cur; + } + } else if (str_endswith(name, ".lora_b")) { + replace_all(name, ".lora_b", ""); + if (ab_map.find(name) == ab_map.end()) { + ab_map[name] = llama_lora_weight(nullptr, cur); + } else { + ab_map[name].b = cur; } - base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); } else { - base_t = ggml_dup_tensor(lora_ctx, model_t); - } - ggml_set_name(base_t, base_name.c_str()); - - // allocate in backend buffer - ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (lora_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); - return 1; - } - - // load tensor data - auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { - read_buf.resize(ggml_nbytes(tensor)); - fin.seek(tensor_meta.offset, SEEK_SET); - fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); - }; - load_tensor(metaA, loraA); - load_tensor(metaB, loraB); - - // load base model tensor data - if (ml) { - ml->load_data_for(base_t); - } else { - ggml_backend_tensor_copy(model_t, base_t); - } - - if (ggml_is_quantized(base_t->type) && !warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } - - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; - } - - auto build_lora_graph = [&]() { - // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); - ggml_set_name(BA, "BA"); - - if (scaling != 1.0f) { - BA = ggml_scale(lora_ctx, BA, scaling); - ggml_set_name(BA, "BA_scaled"); - } - - ggml_tensor * r; - r = ggml_add_inplace(lora_ctx, base_t, BA); - ggml_set_name(r, "r_add"); - - if (base_t->type != model_t->type) { - // convert the result to the model type - r = ggml_cast(lora_ctx, r, model_t->type); - ggml_set_name(r, "r_cast"); - } - - return r; - }; - - ggml_cgraph * gf = ggml_new_graph(lora_ctx); - ggml_tensor * r = build_lora_graph(); - ggml_build_forward_expand(gf, r); - - ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); - if (graph_buf == nullptr) { - LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); - ggml_free(lora_ctx); - ggml_backend_buffer_free(lora_buf); - ggml_backend_free(backend_cpu); - return 1; - } - - ggml_backend_graph_compute(backend_cpu, gf); - - ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); - -#if 0 - // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU - //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); - - // sched compute - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_init_measure(sched, gf); - - // create the graph again, since the previous one was destroyed by the measure - ggml_graph_clear(gf); - ggml_build_forward_expand(gf, build_graph()); - ggml_backend_sched_graph_compute(sched, gf); - ggml_backend_sched_free(sched); -#endif - - ggml_backend_buffer_free(lora_buf); - ggml_backend_buffer_free(graph_buf); - ggml_free(lora_ctx); - - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } } - ggml_backend_free(backend_cpu); + // add tensors + for (auto & it : ab_map) { + const std::string & name = it.first; + llama_lora_weight & w = it.second; - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; - LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); + if (!w.a || !w.b) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); + } + // device buft and device ctx + auto * model_tensor = llama_get_model_tensor(model, name.c_str()); + if (!model_tensor) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); + } + struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); + // validate tensor shape + if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("tensor '" + name + "' has incorrect shape"); + } + if (w.a->ne[1] != w.b->ne[0]) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + } + // save tensor to adapter + struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a); + struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b); + ggml_set_name(tensor_a, w.a->name); + ggml_set_name(tensor_b, w.b->name); + adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b); + } + + // allocate tensors / buffers and zero + { + adapter.ctxs.reserve(ctx_map.size()); + adapter.bufs.reserve(ctx_map.size()); + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx_dev = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft); + if (!buf) { + gguf_free(ctx_gguf); + ggml_free(ctx); + throw std::runtime_error("failed to allocate buffer for lora adapter\n"); + } + LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + adapter.ctxs.push_back(ctx_dev); + adapter.bufs.push_back(buf); + } + } + + // set tensor data + { + llama_file gguf_file(path_lora, "rb"); + std::vector read_buf; + auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) { + size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name)); + size_t size = ggml_nbytes(orig); + read_buf.resize(size); + gguf_file.seek(offs, SEEK_SET); + gguf_file.read_raw(read_buf.data(), size); + ggml_backend_tensor_set(dev, read_buf.data(), 0, size); + }; + for (auto & it : adapter.ab_map) { + auto orig = ab_map[it.first]; + auto dev = it.second; + set_tensor(orig.a, dev.a); + set_tensor(orig.b, dev.b); + } + } + + LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2); + + // free ctx for reading gguf + gguf_free(ctx_gguf); + ggml_free(ctx); +} + +int32_t llama_lora_adapter_set( + struct llama_context * ctx, + struct llama_lora_adapter * adapter, + float scale) { + if (ctx->cparams.flash_attn) { + LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__); + return -1; + } + ctx->lora_adapters[adapter] = scale; return 0; } +int32_t llama_lora_adapter_remove( + struct llama_context * ctx, + struct llama_lora_adapter * adapter) { + auto pos = ctx->lora_adapters.find(adapter); + if (pos != ctx->lora_adapters.end()) { + ctx->lora_adapters.erase(pos); + return 0; + } + return -1; +} + +void llama_lora_adapter_free(struct llama_lora_adapter * adapter) { + delete adapter; +} + // // interface implementation // @@ -19514,12 +19558,14 @@ uint32_t llama_model_quantize( } } -int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { +struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) { try { - return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); + struct llama_lora_adapter * adapter = new llama_lora_adapter(model); + llama_lora_adapter_init_internal(model, path_lora, *adapter); + return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); - return 1; + return nullptr; } } From 7acfd4e8d55082c1b597dfc3ffe04fb5d530c6dc Mon Sep 17 00:00:00 2001 From: compilade Date: Mon, 15 Jul 2024 23:13:10 -0400 Subject: [PATCH 009/170] convert_hf : faster lazy safetensors (#8482) * convert_hf : faster lazy safetensors This makes '--dry-run' much, much faster. * convert_hf : fix memory leak in lazy MoE conversion The '_lazy' queue was sometimes self-referential, which caused reference cycles of objects old enough to avoid garbage collection until potential memory exhaustion. --- convert_hf_to_gguf.py | 48 ++++++++++++++++++++++---- gguf-py/gguf/lazy.py | 63 ++++++++++------------------------ gguf-py/gguf/tensor_mapping.py | 14 ++++---- 3 files changed, 65 insertions(+), 60 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a755b0a60..c2aba9097 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -148,9 +148,16 @@ class Model: tensor_names_from_parts.update(model_part.keys()) for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - if self.lazy: - data = LazyTorchTensor.from_eager(data) + if self.is_safetensors: + if self.lazy: + data = model_part.get_slice(name) + data = LazyTorchTensor.from_safetensors_slice(data) + else: + data = model_part.get_tensor(name) + else: + data = model_part[name] + if self.lazy: + data = LazyTorchTensor.from_eager(data) yield name, data # only verify tensor name presence; it doesn't matter if they are not in the right files @@ -3424,19 +3431,46 @@ class LazyTorchTensor(gguf.LazyBase): torch.float32: np.float32, } + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + def numpy(self) -> gguf.LazyNumpyTensor: dtype = self._dtype_map[self.dtype] return gguf.LazyNumpyTensor( meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - lazy=self._lazy, args=(self,), - func=(lambda s: s[0].numpy()) + func=(lambda s: s.numpy()) ) @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor: + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: return torch.empty(size=shape, dtype=dtype, device="meta") + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + return cast(torch.Tensor, lazy) + @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): del types # unused @@ -3447,7 +3481,7 @@ class LazyTorchTensor(gguf.LazyBase): if func is torch.Tensor.numpy: return args[0].numpy() - return LazyTorchTensor._wrap_fn(func)(*args, **kwargs) + return cls._wrap_fn(func)(*args, **kwargs) def parse_args() -> argparse.Namespace: diff --git a/gguf-py/gguf/lazy.py b/gguf-py/gguf/lazy.py index 6e266f34f..ac98d9a92 100644 --- a/gguf-py/gguf/lazy.py +++ b/gguf-py/gguf/lazy.py @@ -3,7 +3,6 @@ from abc import ABC, ABCMeta, abstractmethod import logging from typing import Any, Callable -from collections import deque import numpy as np from numpy.typing import DTypeLike @@ -74,20 +73,18 @@ class LazyBase(ABC, metaclass=LazyMeta): _tensor_type: type _meta: Any _data: Any | None - _lazy: deque[LazyBase] # shared within a graph, to avoid deep recursion when making eager _args: tuple - _func: Callable[[tuple], Any] | None + _kwargs: dict[str, Any] + _func: Callable[[Any], Any] | None - def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None): + def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None): super().__init__() self._meta = meta self._data = data - self._lazy = lazy if lazy is not None else deque() self._args = args + self._kwargs = kwargs if kwargs is not None else {} self._func = func assert self._func is not None or self._data is not None - if self._data is None: - self._lazy.append(self) def __init_subclass__(cls) -> None: if "_tensor_type" not in cls.__dict__: @@ -117,6 +114,7 @@ class LazyBase(ABC, metaclass=LazyMeta): args = ((use_self,) if use_self is not None else ()) + args meta_args = LazyBase._recurse_apply(args, lambda t: t._meta) + # TODO: maybe handle tensors in kwargs too if isinstance(meta_noop, bool) and not meta_noop: try: @@ -140,23 +138,7 @@ class LazyBase(ABC, metaclass=LazyMeta): res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) if isinstance(res, cls._tensor_type): - class CollectSharedLazy: - # emulating a static variable - shared_lazy: None | deque[LazyBase] = None - - @staticmethod - def collect_replace(t: LazyBase): - if CollectSharedLazy.shared_lazy is None: - CollectSharedLazy.shared_lazy = t._lazy - else: - CollectSharedLazy.shared_lazy.extend(t._lazy) - t._lazy = CollectSharedLazy.shared_lazy - - LazyBase._recurse_apply(args, CollectSharedLazy.collect_replace) - - shared_lazy = CollectSharedLazy.shared_lazy - - return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs)) + return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn) else: del res # not needed # non-tensor return likely relies on the contents of the args @@ -168,26 +150,18 @@ class LazyBase(ABC, metaclass=LazyMeta): @classmethod def to_eager(cls, t: Any) -> Any: def simple_to_eager(_t: LazyBase) -> Any: - def already_eager_to_eager(_t: LazyBase) -> Any: - assert _t._data is not None + if _t._data is not None: return _t._data - while _t._data is None: - lt = _t._lazy.popleft() - if lt._data is not None: - # Lazy tensor did not belong in the lazy queue. - # Weirdly only happens with Bloom models... - # likely because tensors aren't unique in the queue. - # The final output is still the same as in eager mode, - # so it's safe to ignore this. - continue - assert lt._func is not None - lt._args = cls._recurse_apply(lt._args, already_eager_to_eager) - lt._data = lt._func(lt._args) - # sanity check - assert lt._data is not None - assert lt._data.dtype == lt._meta.dtype - assert lt._data.shape == lt._meta.shape + # NOTE: there's a recursion limit in Python (usually 1000) + + assert _t._func is not None + _t._args = cls._recurse_apply(_t._args, simple_to_eager) + _t._data = _t._func(*_t._args, **_t._kwargs) + # sanity check + assert _t._data is not None + assert _t._data.dtype == _t._meta.dtype + assert _t._data.shape == _t._meta.shape return _t._data @@ -206,7 +180,7 @@ class LazyBase(ABC, metaclass=LazyMeta): @classmethod def from_eager(cls, t: Any) -> Any: if type(t) is cls: - # already eager + # already lazy return t elif isinstance(t, cls._tensor_type): return cls(meta=cls.eager_to_meta(t), data=t) @@ -228,8 +202,7 @@ class LazyNumpyTensor(LazyBase): def astype(self, dtype, *args, **kwargs): meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape) full_args = (self, dtype,) + args - # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere. - return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs))) + return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs))) def tofile(self, *args, **kwargs): eager = LazyNumpyTensor.to_eager(self) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 7264240f5..9aa2209e2 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -602,14 +602,12 @@ class TensorNameMap: for tensor, keys in self.block_mappings_cfg.items(): if tensor not in MODEL_TENSORS[arch]: continue - # TODO: make this configurable - n_experts = 160 - for xid in range(n_experts): - tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) - self.mapping[tensor_name] = (tensor, tensor_name) - for key in keys: - key = key.format(bid = bid, xid = xid) - self.mapping[key] = (tensor, tensor_name) + + tensor_name = TENSOR_NAMES[tensor].format(bid = bid) + self.mapping[tensor_name] = (tensor, tensor_name) + for key in keys: + key = key.format(bid = bid) + self.mapping[key] = (tensor, tensor_name) def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) From 0efec57787cb8d8d76b17cd3765e6521e22c1f19 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 16 Jul 2024 10:00:30 +0300 Subject: [PATCH 010/170] llama : valign + remove unused ftype (#8502) --- examples/quantize/quantize.cpp | 74 +++++++++++++++++----------------- include/llama.h | 2 +- src/llama.cpp | 72 ++++++++++++++++----------------- 3 files changed, 72 insertions(+), 76 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1578c4afb..8d7647258 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -16,44 +16,44 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, - { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, - { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", }, - { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, - { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, - { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, - { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, - { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, - { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, - { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, - { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, - { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, - { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, - { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, - { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, - { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, - { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, - { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, - { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, - { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, - { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, - { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, - { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, - { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, - { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", }, - { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, - { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, - { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, - { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", }, + { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", }, + { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, + { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, + { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, + { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, + { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, + { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, + { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, + { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, + { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, + { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, + { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, + { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, + { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, + { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. - { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; diff --git a/include/llama.h b/include/llama.h index c57d21f0c..c0fb53060 100644 --- a/include/llama.h +++ b/include/llama.h @@ -133,7 +133,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index 07bb42713..4a9903cc3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4510,40 +4510,36 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { } switch (ftype) { - case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "F16"; - case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - return "Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; - - // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; - case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; - case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; - case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_ALL_F32: return "all F32"; + case LLAMA_FTYPE_MOSTLY_F16: return "F16"; + case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; + case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; + case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; + case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; + case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1"; + case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; @@ -18069,10 +18065,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; //} bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || - new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || + new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || + new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ1_M) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; From 37b12f92ab696d70f9a65d7447ce721b094fb32e Mon Sep 17 00:00:00 2001 From: Steve Bonds Date: Tue, 16 Jul 2024 00:04:45 -0700 Subject: [PATCH 011/170] export-lora : handle help argument (#8497) The --help option on export-lora isn't accepted as valid. The help still gets displayed by default, but the script exits with an error message and nonzero status. --- examples/export-lora/export-lora.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 08413f57e..16f27aa77 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -190,6 +190,9 @@ static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_ if (params->n_threads <= 0) { params->n_threads = std::thread::hardware_concurrency(); } + } else if (arg == "-h" || arg == "--help") { + export_lora_print_usage(argc, argv, &default_params); + exit(0); } else { fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); export_lora_print_usage(argc, argv, &default_params); From 1666f92dcda14e002cbb08e1028f9fff341d73ad Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 16 Jul 2024 17:14:16 +1000 Subject: [PATCH 012/170] gguf-hash : update clib.json to point to original xxhash repo (#8491) * Update clib.json to point to Cyan4973 original xxhash Convinced Cyan4973 to add clib.json directly to his repo, so can now point the clib package directly to him now. Previously pointed to my fork with the clib.json package metadata https://github.com/Cyan4973/xxHash/pull/954 * gguf-hash: readme update to point to Cyan4973 xxHash repo [no ci] --- examples/gguf-hash/README.md | 2 +- examples/gguf-hash/deps/xxhash/clib.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md index 2320ffd81..9871651e3 100644 --- a/examples/gguf-hash/README.md +++ b/examples/gguf-hash/README.md @@ -201,6 +201,6 @@ Verification results for test.gguf.manifest - Success These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs) -- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash) +- https://github.com/Cyan4973/xxHash - https://github.com/clibs/sha1/ - https://github.com/jb55/sha256.c diff --git a/examples/gguf-hash/deps/xxhash/clib.json b/examples/gguf-hash/deps/xxhash/clib.json index 0048c74dc..242343c5d 100644 --- a/examples/gguf-hash/deps/xxhash/clib.json +++ b/examples/gguf-hash/deps/xxhash/clib.json @@ -1,7 +1,7 @@ { "name": "xxhash", "version": "0.8.2", - "repo": "mofosyne/xxhash", + "repo": "Cyan4973/xxhash", "description": "Extremely fast non-cryptographic hash algorithm", "keywords": ["xxhash", "hashing"], "license": "BSD-2-Clause", From 5e116e8dd51775f8f1c090570be148d5d7eea6c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 16 Jul 2024 21:20:59 +0200 Subject: [PATCH 013/170] make/cmake: add missing force MMQ/cuBLAS for HIP (#8515) --- Makefile | 8 ++++++++ ggml/src/CMakeLists.txt | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/Makefile b/Makefile index bec332ec5..4584594af 100644 --- a/Makefile +++ b/Makefile @@ -795,6 +795,14 @@ ifdef GGML_CUDA_FORCE_DMMV HIPFLAGS += -DGGML_CUDA_FORCE_DMMV endif # GGML_CUDA_FORCE_DMMV +ifdef GGML_CUDA_FORCE_MMQ + HIPFLAGS += -DGGML_CUDA_FORCE_MMQ +endif # GGML_CUDA_FORCE_MMQ + +ifdef GGML_CUDA_FORCE_CUBLAS + HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS +endif # GGML_CUDA_FORCE_CUBLAS + ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 41e50ad2c..cbadaf4d9 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -440,6 +440,10 @@ if (GGML_HIPBLAS) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() + if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) + endif() + if (GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY) endif() From d65a8361fed8be97389776fb94217e937ec6b03c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 17 Jul 2024 10:32:59 +0300 Subject: [PATCH 014/170] llama : disable context-shift for DeepSeek v2 (#8501) --- src/llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index 4a9903cc3..89b5763dc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15133,6 +15133,10 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA + GGML_ASSERT(false && "Deepseek2 does not support K-shift"); + } + { ggml_backend_sched_reset(lctx.sched); From da3913d8f9475b0d4bcbeb4936c724af4eade092 Mon Sep 17 00:00:00 2001 From: "Masaya, Kato" <62578291+msy-kato@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:34:28 +0900 Subject: [PATCH 015/170] batched: fix n_predict parameter (#8527) --- examples/batched/batched.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 2442e954d..53fbfb0a8 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -31,7 +31,7 @@ int main(int argc, char ** argv) { int n_parallel = params.n_parallel; // total length of the sequences including the prompt - int n_predict = 32; + int n_predict = params.n_predict; // init LLM From 1bdd8ae19f50bc6f108fa247e90688e5c60559fc Mon Sep 17 00:00:00 2001 From: hipudding Date: Wed, 17 Jul 2024 19:23:50 +0800 Subject: [PATCH 016/170] [CANN] Add Ascend NPU backend (#6035) * [CANN] Add Ascend NPU backend Ascend is a full-stack AI computing infrastructure for industry applications and services based on Huawei Ascend processors and software. CANN (Compute Architecture of Neural Networks), developped by Huawei, is a heterogeneous computing architecture for AI. Co-authored-by: wangshuai09 <391746016@qq.com> * delete trailing whitespaces * Modify the code based on review comment * Rename LLAMA_CANN to GGML_CANN * Make ggml-common.h private * add ggml_cann prefix for acl funcs * Add logging for CANN backend * Delete Trailing whitespace --------- Co-authored-by: wangshuai09 <391746016@qq.com> --- CMakeLists.txt | 1 + examples/llama-bench/llama-bench.cpp | 15 + examples/llava/clip.cpp | 9 + ggml/include/ggml-cann.h | 125 + ggml/include/ggml.h | 3 + ggml/src/CMakeLists.txt | 69 + ggml/src/ggml-backend.c | 5 + ggml/src/ggml-cann.cpp | 2023 +++++++++++ ggml/src/ggml-cann/.clang-format | 168 + ggml/src/ggml-cann/Doxyfile | 2579 +++++++++++++++ ggml/src/ggml-cann/acl_tensor.cpp | 198 ++ ggml/src/ggml-cann/acl_tensor.h | 230 ++ ggml/src/ggml-cann/aclnn_ops.cpp | 2944 +++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.h | 592 ++++ ggml/src/ggml-cann/common.h | 282 ++ ggml/src/ggml-cann/kernels/CMakeLists.txt | 32 + ggml/src/ggml-cann/kernels/ascendc_kernels.h | 17 + ggml/src/ggml-cann/kernels/dup.cpp | 223 ++ ggml/src/ggml-cann/kernels/get_row_f16.cpp | 186 ++ ggml/src/ggml-cann/kernels/get_row_f32.cpp | 180 + ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 193 ++ ggml/src/ggml-cann/kernels/get_row_q8_0.cpp | 191 ++ .../ggml-cann/kernels/quantize_f16_q8_0.cpp | 208 ++ .../ggml-cann/kernels/quantize_f32_q8_0.cpp | 206 ++ ggml/src/ggml.c | 11 +- src/llama.cpp | 41 + tests/test-backend-ops.cpp | 33 +- 27 files changed, 10756 insertions(+), 8 deletions(-) create mode 100644 ggml/include/ggml-cann.h create mode 100644 ggml/src/ggml-cann.cpp create mode 100644 ggml/src/ggml-cann/.clang-format create mode 100644 ggml/src/ggml-cann/Doxyfile create mode 100644 ggml/src/ggml-cann/acl_tensor.cpp create mode 100644 ggml/src/ggml-cann/acl_tensor.h create mode 100644 ggml/src/ggml-cann/aclnn_ops.cpp create mode 100644 ggml/src/ggml-cann/aclnn_ops.h create mode 100644 ggml/src/ggml-cann/common.h create mode 100644 ggml/src/ggml-cann/kernels/CMakeLists.txt create mode 100644 ggml/src/ggml-cann/kernels/ascendc_kernels.h create mode 100644 ggml/src/ggml-cann/kernels/dup.cpp create mode 100644 ggml/src/ggml-cann/kernels/get_row_f16.cpp create mode 100644 ggml/src/ggml-cann/kernels/get_row_f32.cpp create mode 100644 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp create mode 100644 ggml/src/ggml-cann/kernels/get_row_q8_0.cpp create mode 100644 ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp create mode 100644 ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 67dcf86d4..793709122 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_CANN GGML_CANN) # # build the library diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d641a9f12..a6497b6e0 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -23,6 +23,10 @@ #include "ggml-cuda.h" #include "ggml-sycl.h" +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -120,6 +124,17 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } #endif // TODO: other backends return id; diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d6882eec3..d23e282fb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,6 +16,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + LOG_TEE("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h new file mode 100644 index 000000000..ca73211fe --- /dev/null +++ b/ggml/include/ggml-cann.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Maximum number of CANN devices supported. + */ +#define GGML_CANN_MAX_DEVICES 16 + +/** + * @brief Initializes the CANN backend for a specified device. + * + * This function initializes the CANN backend for the given device. + * It verifies the device index, allocates a context, and creates a backend + * instance. + * + * @param device The index of the device to initialize. + * @return A pointer to the initialized backend instance, or nullptr on failure. + */ +GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device); + +/** + * @brief Checks if a given backend is a CANN backend. + * + * This function verifies if the provided backend is a CANN backend by comparing + * its GUID with the CANN backend's GUID. + * + * @param backend The backend instance to check. + * @return True if the backend is a CANN backend, false otherwise. + */ +GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend); + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +GGML_API GGML_CALL ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +/** + * @brief Retrieves the number of CANN devices available. + * + * This function returns the number of CANN devices available based on + * information obtained from `ggml_cann_info()`. + * + * @return The number of CANN devices available. + */ +GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); + +/** + * @brief Retrieves the description of a specific CANN device. + * + * This function sets the specified device, retrieves the SoC name, + * and writes it into the provided description buffer. + * + * @param device The device index to retrieve the description for. + * @param description Pointer to a buffer where the description will be written. + * @param description_size Size of the description buffer. + */ +GGML_API GGML_CALL void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); + +/** + * @brief Retrieves the memory information of a specific CANN device. + * + * This function sets the specified device, retrieves the free and total + * memory information of the specified type (ACL_HBM_MEM), and stores them + * in the provided pointers. + * + * @param device The device index to retrieve memory information for. + * @param free Pointer to a variable where the free memory size will be stored. + * @param total Pointer to a variable where the total memory size will be + * stored. + */ +GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); + +/** + * @brief Set the logging callback for GGML. + * + * This function sets the logging callback and user data for logging. + * + * @param log_callback The logging callback to set. + * @param user_data User data to pass to the logging callback. + */ +GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback, + void* user_data); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index f2145ff35..2fdb9fa40 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -753,6 +753,8 @@ extern "C" { GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); @@ -2397,6 +2399,7 @@ extern "C" { GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_matmul_int8(void); + GGML_API int ggml_cpu_has_cann (void); // // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cbadaf4d9..3f4c66bf7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -770,6 +770,74 @@ if (GGML_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() +if (GGML_CANN) + if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME}) + set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME}) + message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}") + endif() + + if (CANN_INSTALL_DIR) + # Only Support Linux. + if (GGML_CANN) + if (NOT UNIX) + set(GGML_CANN OFF) + message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN") + endif() + endif() + + # Supported platforms: x86-64, arm64 + if (GGML_CANN) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") + elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + else() + set(GGML_CANN OFF) + message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN") + endif() + endif() + + # Set header and libs + if(GGML_CANN) + set(CANN_INCLUDE_DIRS + ${CANN_INSTALL_DIR}/include + ${CANN_INSTALL_DIR}/include/aclnn + ${CANN_INSTALL_DIR}/acllib/include + ) + + # TODO: find libs + link_directories( + ${CANN_INSTALL_DIR}/lib64 + ) + + add_subdirectory(ggml-cann/kernels) + list(APPEND CANN_LIBRARIES + ascendcl + nnopbase + opapi + acl_op_compiler + ascendc_kernels + ) + + set(GGML_HEADERS_CANN "../include/ggml-cann.h") + file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp") + list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp") + + message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") + message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") + + set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) + set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) + endif() + else() + set(GGML_CANN OFF) + message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN") + endif() + + if(NOT GGML_CANN) + message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.") + endif() +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -1184,6 +1252,7 @@ add_library(ggml ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} + ${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN} ggml-aarch64.c ggml-aarch64.h ) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index dbbaa3941..01c87efd0 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -445,6 +445,11 @@ GGML_CALL static void ggml_backend_registry_init(void) { extern GGML_CALL void ggml_backend_kompute_reg_devices(void); ggml_backend_kompute_reg_devices(); #endif + +#ifdef GGML_USE_CANN + extern GGML_CALL int ggml_backend_cann_reg_devices(void); + ggml_backend_cann_reg_devices(); +#endif } GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp new file mode 100644 index 000000000..9bf7e332a --- /dev/null +++ b/ggml/src/ggml-cann.cpp @@ -0,0 +1,2023 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "ggml-cann.h" + +#include +#include + +#include +#include +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-cann/aclnn_ops.h" +#include "ggml-cann/common.h" + +#define GGML_COMMON_DECL_C + +#include "ggml-common.h" + +/** + * @brief Default logging callback for GGML. + * + * This function is the default logging callback that logs messages to stderr. + * + * @param level The log level. + * @param msg The log message. + * @param user_data User data passed to the callback. + */ +static void ggml_cann_default_log_callback(enum ggml_log_level level, + const char* msg, void* user_data) { + GGML_UNUSED(level); + GGML_UNUSED(user_data); + fprintf(stderr, "%s", msg); +} + +ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback; +void* ggml_cann_log_user_data = NULL; + +GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback, + void* user_data) { + ggml_cann_log_callback = log_callback; + ggml_cann_log_user_data = user_data; +} + +#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_CANN_LOG_ERROR(...) \ + ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) + +GGML_ATTRIBUTE_FORMAT(2, 3) + +/** + * @brief Log a message using the current logging callback. + * + * This function formats a log message and passes it to the current logging + * callback. + * + * @param level The log level. + * @param format The format string for the log message. + * @param ... The arguments for the format string. + */ +static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) { + if (ggml_cann_log_callback != NULL) { + va_list args; + va_start(args, format); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data); + } else { + // vsnprintf adds a null terminator + std::vector buffer2(len + 1); + va_end(args); + va_start(args, format); + vsnprintf(&buffer2[0], buffer2.size(), format, args); + ggml_cann_log_callback(level, buffer2.data(), + ggml_cann_log_user_data); + } + va_end(args); + } +} + +/** + * @brief Handles CANN errors by printing an error message and aborting. + * + * @param stmt The statement that caused the error. + * @param func The function in which the error occurred. + * @param file The file in which the error occurred. + * @param line The line number where the error occurred. + * @param msg The error message. + */ +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg) { + int32_t id = -1; + aclrtGetDevice(&id); + + GGML_CANN_LOG_ERROR("CANN error: %s\n", msg); + GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, + file, line); + GGML_CANN_LOG_ERROR(" %s\n", stmt); + // abort with GGML_ASSERT to get a stack trace + GGML_ASSERT(!"CANN error"); +} + +/** + * @brief Sets the device to be used by CANN. + * + * @param device The device ID to set. + */ +void ggml_cann_set_device(const int32_t device) { + // TODO: uncomment these lines after empty context has fixed. + // int current_device; + // ACL_CHECK(aclrtGetDevice(¤t_device)); + + // if (device == current_device) { + // return; + // } + ACL_CHECK(aclrtSetDevice(device)); +} + +/** + * @brief Retrieves the current device ID. + * + * @return The current device ID. + */ +int32_t ggml_cann_get_device() { + int32_t id; + ACL_CHECK(aclrtGetDevice(&id)); + return id; +} + +/** + * @brief Initialize the CANN device information. + * + * This function initializes the CANN device information by obtaining the + * device count and setting the memory allocation granularity for each device. + * + * @return A structure containing the device information. + */ +static ggml_cann_device_info ggml_cann_init() { + ggml_cann_device_info info = {}; + + aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); + + if (err != ACL_SUCCESS) { + GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n", + __func__, aclGetRecentErrMsg()); + return info; + } + + GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES); + + for (int id = 0; id < info.device_count; ++id) { + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = id; + prop.reserve = 0; + ACL_CHECK(aclrtMemGetAllocationGranularity( + &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, + &info.devices[id].vmm_granularity)); + } + + // TODO: add more device info later. + return info; +} + +/** + * @brief Retrieve the CANN device information. + * + * This function returns a reference to a structure containing the CANN device + * information. The device information is initialized once and reused on + * subsequent calls. + * + * @return A reference to the structure containing the device information. + */ +const ggml_cann_device_info& ggml_cann_info() { + static ggml_cann_device_info info = ggml_cann_init(); + return info; +} + +//#define DEBUG_CANN_MALLOC +/** + * @brief A pool of CANN buffers(legacy). + * + * This class manages a pool of CANN buffers for a specific device. + */ +struct ggml_cann_pool_leg : public ggml_cann_pool { + /** + * @brief The maximum number of buffers in the pool. + */ + static const int MAX_BUFFERS = 256; + + /** + * @brief The device ID associated with this buffer pool. + */ + int device; + + /** + * @brief Structure representing a CANN buffer. + */ + struct ggml_cann_buffer { + void* ptr = nullptr; ///< Pointer to the buffer memory. + size_t size = 0; ///< Size of the buffer. + }; + + /** + * @brief Array of CANN buffers in the pool. + */ + ggml_cann_buffer buffer_pool[MAX_BUFFERS] = {}; + + /** + * @brief Total size of all buffers in the pool. + */ + size_t pool_size = 0; + + /** + * @brief Constructor to initialize the buffer pool for a specific device. + * + * @param device The device ID to associate with this buffer pool. + */ + explicit ggml_cann_pool_leg(int device) : device(device) {} + + /** + * @brief Destructor to free all buffers in the pool. + */ + ~ggml_cann_pool_leg() { + ggml_cann_set_device(device); + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { + ACL_CHECK(aclrtFree(b.ptr)); + pool_size -= b.size; + } + } + GGML_ASSERT(pool_size == 0); + } + + /** + * @brief Allocate a buffer of the given size. + * + * @param size The size of the buffer to allocate. + * @param actual_size A pointer to a variable to receive the actual size of + * the allocated buffer. + * @return A pointer to the allocated buffer. + */ + void* alloc(size_t size, size_t* actual_size) override { +#ifdef DEBUG_CANN_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CANN_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void* ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + ggml_cann_buffer& b = buffer_pool[ibest]; + void* ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void* ptr; + size_t look_ahead_size = (size_t)(1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255) / 256); + ggml_cann_set_device(device); + ACL_CHECK( + aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = look_ahead_size; + pool_size += look_ahead_size; +#ifdef DEBUG_CANN_MALLOC + GGML_CANN_LOG_INFO( + "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " + "requested %u MB\n", + __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), + (uint32_t)(pool_size / 1024 / 1024), + (uint32_t)(size / 1024 / 1024)); +#endif + return ptr; + } + + /** + * @brief Free a buffer and return it to the pool. + * + * @param ptr Pointer to the buffer to free. + * @param size Size of the buffer to free. + */ + void free(void* ptr, size_t size) override { + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + // memory should always buffered. these memory may still needed by + // tasks in stream. + // TODO, fix me. + GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + } +}; + +/** + * @brief A pool of CANN buffers with virtual memory. + * + * This class manages a pool of CANN buffers with virtual memory for a specific + * device. + */ +struct ggml_cann_pool_vmm : public ggml_cann_pool { + /** + * @brief The maximum size of the virtual memory pool (32 GB). + */ + static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + + /** + * @brief The device ID associated with this buffer pool. + */ + int device; + + /** + * @brief Pointer to the start of the virtual memory pool. + */ + void* pool_addr = 0; + + /** + * @brief Amount of virtual memory used in the pool. + */ + size_t pool_used = 0; + + /** + * @brief Total size of the virtual memory pool. + */ + size_t pool_size = 0; + + /** + * @brief Allocation granularity for the virtual memory pool. + */ + size_t granularity; + + /** + * @brief Handles for the physical memory allocated. + */ + std::vector handles; + + /** + * @brief Offsets for the mapped memory regions. + */ + std::vector map_offsets; + + /** + * @brief Constructor to initialize the buffer pool with virtual memory for + * a specific device. + * + * @param device The device ID to associate with this buffer pool. + */ + explicit ggml_cann_pool_vmm(int device) + : device(device), + granularity(ggml_cann_info().devices[device].vmm_granularity) {} + + /** + * @brief Destructor to free all buffers in the virtual memory pool. + */ + ~ggml_cann_pool_vmm() { + if (pool_addr != 0) { + for (auto& offset : map_offsets) { + ACL_CHECK(aclrtUnmapMem(offset)); + } + for (auto& handle : handles) { + ACL_CHECK(aclrtFreePhysical(handle)); + } + ACL_CHECK(aclrtReleaseMemAddress(pool_addr)); + } + } + + /** + * @brief Allocate a buffer of the given size in the virtual memory pool. + * + * @param size The size of the buffer to allocate. + * @param actual_size A pointer to a variable to receive the actual size of + * the allocated buffer. + * @return A pointer to the allocated buffer. + */ + void* alloc(size_t size, size_t* actual_size) override { + // round up the allocation size to the alignment to ensure that all + // allocations are aligned for all data types + const size_t alignment = 128; + size = alignment * ((size + alignment - 1) / alignment); + + size_t avail = pool_size - pool_used; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = size - avail; + reserve_size = + granularity * ((reserve_size + granularity - 1) / granularity); + + GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.reserve = 0; + aclrtDrvMemHandle handle; + ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0)); + + // reserve virtual address space (if not already reserved) + if (pool_addr == 0) { + ACL_CHECK(aclrtReserveMemAddress( + &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1)); + } + + // map at the end of the pool + ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0, + handle, 0)); + + handles.push_back(handle); + map_offsets.push_back((char*)pool_addr + pool_size); + + // add to the pool + pool_size += reserve_size; + + // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB ( + // reserved %llu MB)\n", + // device, (unsigned long long) (pool_size/1024/1024), + // (unsigned long long) (reserve_size/1024/1024)); + } + + GGML_ASSERT(pool_addr != 0); + + void* ptr = (void*)((char*)pool_addr + pool_used); + *actual_size = size; + pool_used += size; + +#ifdef DEBUG_CANN_MALLOC + GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, + (unsigned long long)size, (unsigned long long)ptr); +#endif + return ptr; + } + + /** + * @brief Free a buffer and return it to the virtual memory pool. + * + * @param ptr Pointer to the buffer to free. + * @param size Size of the buffer to free. + */ + void free(void* ptr, size_t size) override { +#ifdef DEBUG_CANN_MALLOC + GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, + (unsigned long long)size, (unsigned long long)ptr); +#endif + + pool_used -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used)); + } +}; + +/** + * @brief Create a new CANN pool for a specific device. + * + * Factory method to create a new CANN pool object based on the device type. + * + * @param device The device ID for which to create the pool. + * @return A unique pointer to the created CANN pool. + */ +std::unique_ptr ggml_backend_cann_context::new_pool_for_device( + int device) { + // return std::unique_ptr(new ggml_cann_pool_leg(device)); + return std::unique_ptr(new ggml_cann_pool_vmm(device)); +} + +// cann buffer +/** + * @brief Context for managing a CANN buffer associated with a specific device. + * + * This structure holds information about a CANN buffer, including the device + * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID. + */ +struct ggml_backend_cann_buffer_context { + int32_t device; ///< The device ID associated with this buffer context. + void* dev_ptr = + nullptr; ///< Pointer to the device memory allocated for the buffer. + + /** + * @brief Constructor to initialize the CANN buffer context. + * + * @param device The device ID associated with this buffer context. + * @param dev_ptr Pointer to the device memory allocated for the buffer. + */ + ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) + : device(device), + dev_ptr(dev_ptr) {} + + /** + * @brief Destructor to free the device memory allocated for the buffer. + */ + ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } +}; + +/** + * @brief Retrieve the name associated with a CANN buffer. + * + * This function returns the name of a CANN buffer, which is stored in the + * context of the buffer. + * + * @param buffer The CANN buffer whose name is to be retrieved. + * @return A pointer to a C-string containing the name of the buffer. + */ + +GGML_CALL static const char* ggml_backend_cann_buffer_get_name( + ggml_backend_buffer_t buffer) { + return "CANN"; + + GGML_UNUSED(buffer); +} + +/** + * @brief Check if a buffer is a CANN buffer. + * + * This function checks if a given buffer is a CANN buffer by comparing its + * `get_name` function pointer to `ggml_backend_cann_buffer_get_name`. + * + * @param buffer The buffer to check. + * @return true if the buffer is a CANN buffer, false otherwise. + */ +GGML_CALL static bool ggml_backend_buffer_is_cann( + ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cann_buffer_get_name; +} + +/** + * @brief Free resources associated with a CANN buffer. + * + * This function frees the resources associated with a CANN buffer, including + * its context. + * + * @param buffer The CANN buffer to free. + */ +GGML_CALL static void ggml_backend_cann_buffer_free_buffer( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + delete ctx; +} + +/** + * @brief Retrieve the base pointer of a CANN buffer. + * + * This function returns the base pointer of a CANN buffer, which points to the + * device memory allocated for the buffer. + * + * @param buffer The CANN buffer whose base pointer is to be retrieved. + * @return A pointer to the base of the device memory allocated for the buffer. + */ +GGML_CALL static void* ggml_backend_cann_buffer_get_base( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + return ctx->dev_ptr; +} + +/** + * @brief Transform quantized Q4.0 tensor data into a format suitable for CANN + * processing. + * + * This function transforms quantized Q4.0 tensor data into a format suitable + * for CANN processing. It extracts quantization values and scales from the + * source data and prepares them in a format expected by CANN operations. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data in Q4.0 format. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. + */ +GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, + const void* src, + void* dst) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q4_0* group = + (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0)); + *scale_offset = group->d; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] & 0x0F); + (*quant_offset) |= ((group->qs[j + 1] << 4)); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] >> 4); + (*quant_offset) |= (group->qs[j + 1] & 0xF0); + quant_offset++; + } + } + + // put (uint4b_t -8) into int4b_t + for (quant_offset = (uint8_t*)dst; + quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } +} + +/** + * @brief Transform CANN processed data back into quantized Q4.0 format. + * + * This function transforms CANN processed data back into quantized Q4.0 format. + * It reverses the transformation performed by + * ggml_backend_cann_transform_q4_0(), converting the data back into its + * original quantized form. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source buffer containing transformed data. + * @param dst Pointer to the destination buffer where the Q4.0 formatted data + * will be stored. + */ +GGML_CALL static void ggml_backend_cann_transform_back_q4_0( + const ggml_tensor* tensor, void* src, void* dst) { + GGML_ASSERT(tensor->op == GGML_OP_NONE); + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)src; + uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes); + + for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } + quant_offset = (uint8_t*)src; + + for (int i = 0; i < groups; i++) { + block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0)); + group->d = *scale_offset; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] = ((*quant_offset) & 0x0F); + group->qs[j + 1] = ((*quant_offset) >> 4); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] |= ((*quant_offset) << 4); + group->qs[j + 1] |= ((*quant_offset) & 0xF0); + quant_offset++; + } + } +} + +/** + * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN + * processing. + * + * This function transforms quantized Q8.0 tensor data into a format suitable + * for CANN processing. It extracts quantization values and scales from the + * source data and prepares them in a format expected by CANN operations. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data in Q8.0 format. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. + */ +GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, + const void* src, + void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q8_0* group = + (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0)); + *scale_offset = group->d; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(quant_offset, group->qs, group_quant_size); + quant_offset += group_quant_size; + } +} + +/** + * @brief Transform CANN processed data back into quantized Q8.0 format. + * + * This function transforms CANN processed data back into quantized Q8.0 format. + * It reverses the transformation performed by + * ggml_backend_cann_transform_q8_0(), converting the data back into its + * original quantized form. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source buffer containing transformed data. + * @param dst Pointer to the destination buffer where the Q8.0 formatted data + * will be stored. + */ +GGML_CALL static void ggml_backend_cann_transform_back_q8_0( + const ggml_tensor* tensor, const void* src, void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + const uint8_t* quant_offset = (const uint8_t*)src; + const uint16_t* scale_offset = + (const uint16_t*)((const char*)src + quant_bytes); + + for (int i = 0; i < groups; i++) { + block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0)); + group->d = *scale_offset; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(group->qs, quant_offset, group_quant_size); + quant_offset += group_quant_size; + } +} + +/** + * @brief Transform tensor data based on its type for CANN processing. + * + * This function transforms tensor data based on its quantization type for CANN + * processing. It dispatches the transformation based on the tensor's type to + * specialized functions handling Q4.0 and Q8.0 formats. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data to be transformed. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. + */ +GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor, + const void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +/** + * @brief Transform CANN processed data back into tensor data based on its type. + * + * This function transforms CANN processed data back into tensor data based on + * its quantization type for Q4.0 and Q8.0 formats. It dispatches the + * transformation based on the tensor's type to specialized functions. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data containing CANN processed data. + * @param dst Pointer to the destination buffer where transformed tensor data + * will be stored. + */ +GGML_CALL static void ggml_backend_cann_transform_back( + const ggml_tensor* tensor, void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_back_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_back_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +/** + * @brief Check if transformation is needed for a given tensor type. + * + * This function checks if transformation is needed for a given tensor type + * to prepare data for CANN processing. + * + * @param type The tensor type to check. + * @return true if transformation is needed, false otherwise. + */ +GGML_CALL static bool need_transform(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } +} + +/** + * @brief Initialize a tensor using data from a CANN buffer. + * + * This function initializes a tensor using data from a CANN buffer. + * It handles special cases such as views and quantization. + * + * @param buffer The CANN buffer from which to initialize the tensor. + * @param tensor Pointer to the tensor to be initialized. + */ +GGML_CALL static void ggml_backend_cann_buffer_init_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + return; + } + + // TODO: can backend doesn't support quantized yet. Just leave the code + // here. + if (ggml_is_quantized(tensor->type)) { + // Initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t padded_size = + ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + size_t memset_size = padded_size - original_size; + ACL_CHECK(aclrtMemset((char*)tensor->data + original_size, + memset_size, 0, memset_size)); + } + } +} + +// TODO: need handle tensor which has paddings. +/** + * @brief Set tensor data in a CANN buffer. + * + * This function sets tensor data in a CANN buffer, handling transformations + * if needed based on the tensor's type. + * + * @param buffer The CANN buffer where the tensor data will be set. + * @param tensor Pointer to the tensor whose data will be set. + * @param data Pointer to the source data to be copied into the tensor. + * @param offset Offset in the source data from where to start copying. + * @param size Size of the data to be copied, in bytes. + */ +GGML_CALL static void ggml_backend_cann_buffer_set_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor, const void* data, + size_t offset, size_t size) { + // GGML_ASSERT(size == ggml_nbytes(tensor)); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + // TODO: refer to cann(#6017), it use thread's default stream. + // For acl, synchronous functions use this default stream. + // Why aclrtSynchronizeDevice? + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy(tensor->data, size, (const char*)data + offset, + size, ACL_MEMCPY_HOST_TO_DEVICE)); + } else { + void* transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, (const char*)data + offset, + transform_buffer); + +#ifndef NDEBUG + void* check_buffer = malloc(size); + ggml_backend_cann_transform_back(tensor, transform_buffer, + check_buffer); + GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size) == + 0); + free(check_buffer); +#endif + ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + free(transform_buffer); + } +} + +/** + * @brief Get tensor data from a CANN buffer. + * + * This function retrieves tensor data from a CANN buffer, handling + * transformations if needed based on the tensor's type. + * + * @param buffer The CANN buffer from which to retrieve tensor data. + * @param tensor Pointer to the tensor whose data will be retrieved. + * @param data Pointer to the destination buffer where the tensor data will be + * copied. + * @param offset Offset in the destination buffer where to start copying. + * @param size Size of the data to be copied, in bytes. + */ +GGML_CALL static void ggml_backend_cann_buffer_get_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + GGML_ASSERT(size == ggml_nbytes(tensor)); + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy((char*)data + offset, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpy(transform_buffer, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + ggml_backend_cann_transform_back(tensor, transform_buffer, + (char*)data + offset); + free(transform_buffer); + } +} + +/** + * @brief Copy tensor data between CANN buffers if possible. + * + * This function copies tensor data between CANN buffers if the source and + * destination buffers are CANN buffers and they meet the necessary conditions + * (same device or devices can access each other). + * + * @param buffer The destination CANN buffer where the tensor data will be + * copied. + * @param src Pointer to the source tensor whose data will be copied. + * @param dst Pointer to the destination tensor where the data will be copied. + * @return true if the copy operation succeeded, false otherwise. + */ +GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { + if (ggml_backend_buffer_is_cann(src->buffer)) { + ggml_backend_cann_buffer_context* src_ctx = + (ggml_backend_cann_buffer_context*)src->buffer->context; + ggml_backend_cann_buffer_context* dst_ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + size_t memcpy_size = ggml_nbytes(src); + // Same device. + if (src_ctx->device == dst_ctx->device) { + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } else { + // Different device but can access by peer. + int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, + dst_ctx->device)); + if (canAccessPeer) { + ggml_cann_set_device(src_ctx->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0)); + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } + } + } + return false; +} + +/** + * @brief Clear a CANN buffer by setting all its memory to a specified value. + * + * This function clears a CANN buffer by setting all its memory to a specified + * value. + * + * @param buffer The CANN buffer to be cleared. + * @param value The value to which each byte in the buffer will be set. + */ +GGML_CALL static void ggml_backend_cann_buffer_clear( + ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); +} + +/** + * @brief Interface for a CANN buffer in the backend. + * + * This structure defines function pointers to operations that can be performed + * on a CANN buffer within the backend. + */ +static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { + /* .get_name = */ ggml_backend_cann_buffer_get_name, + /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, + /* .get_base = */ ggml_backend_cann_buffer_get_base, + /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cann_buffer_clear, + /* .reset = */ NULL, +}; + +// cann buffer type +/** + * @brief Structure representing context information for a specific backend + * buffer type. + */ +struct ggml_backend_cann_buffer_type_context { + int32_t + device; /**< Device identifier associated with the buffer context. */ + std::string name; /**< Name associated with the buffer context. */ +}; + +/** + * @brief Retrieves the name associated with a CANN buffer type. + * + * This function returns the descriptive name associated with the specified + * CANN buffer type context. + * + * @param buft Pointer to the buffer type context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char* ggml_backend_cann_buffer_type_name( + ggml_backend_buffer_type_t buft) { + return "CANN"; + + GGML_UNUSED(buft); +} + +/** + * @brief Allocates a new CANN buffer of the specified type and size. + * + * This function allocates a new CANN buffer on the specified device with the + * given size. + * + * @param buft Pointer to the buffer type context. + * @param size Size in bytes of the buffer to allocate. + * @return Pointer to the allocated buffer, or nullptr if allocation fails. + */ +GGML_CALL static ggml_backend_buffer_t +ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + ggml_cann_set_device(buft_ctx->device); + + size = std::max(size, (size_t)1); + + void* dev_ptr; + aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (err != ACL_SUCCESS) { + GGML_CANN_LOG_ERROR( + "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", + __func__, size / 1024.0 / 1024.0, buft_ctx->device, + aclGetRecentErrMsg()); + return nullptr; + } + + ggml_backend_cann_buffer_context* ctx = + new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + + return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, + ctx, size); +} + +/** + * @brief Retrieves the memory alignment requirement for CANN buffers of this + * type. + * + * This function returns the alignment requirement in bytes for memory allocated + * by the CANN buffer type. + * + * @param buft Pointer to the buffer type context (unused in this + * implementation). + * @return The alignment requirement in bytes (fixed at 128 bytes for CANN + * buffers). + */ +GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { + return 128; + + GGML_UNUSED(buft); +} + +/** + * @brief Calculates the allocation size required for a tensor in a CANN buffer. + * + * Computes the total allocation size needed for storing the tensor's data in a + * CANN buffer, considering any necessary padding or adjustments for quantized + * types. + * + * @param buft Pointer to the buffer type context (unused in this + * implementation). + * @param tensor Pointer to the tensor for which the allocation size is + * calculated. + * @return The total allocation size in bytes required for the tensor in the + * CANN buffer. + */ +GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size( + ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; + + // last line must bigger than 32, because every single op deal at + // least 32 bytes. + // TODO: quantized type? + // int64_t line_size = ne0 * ggml_element_size(tensor); + // int64_t line_size_align_32 = (line_size + 31) & ~31; + // size += (line_size_align_32 - line_size); + + // TODO: not support quantized yet. + // TODO: consider un-continue tensor. + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size( + tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + GGML_UNUSED(buft); +} + +/** + * @brief Interface for managing CANN buffer types in the GGML backend. + * + * Provides function pointers for allocating, querying properties, and managing + * memory for CANN buffer types in the GGML backend. + */ +static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { + /* .get_name = */ ggml_backend_cann_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +GGML_CALL ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (device >= ggml_backend_cann_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type + ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES]; + + static bool ggml_backend_cann_buffer_type_initialized = false; + + if (!ggml_backend_cann_buffer_type_initialized) { + for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { + ggml_backend_cann_buffer_types[i] = { + /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .context = */ + new ggml_backend_cann_buffer_type_context{ + i, "CANN" + std::to_string(i)}, + }; + } + ggml_backend_cann_buffer_type_initialized = true; + } + + return &ggml_backend_cann_buffer_types[device]; +} + +/** + * @brief Computes the forward operation for a given tensor using CANN + * operations. + * + * This function selects the appropriate CANN operation based on the type of + * operation specified in the tensor and performs the computation. + * + * @param ctx The CANN context containing necessary resources and + * configurations. + * @param dst The destination tensor where the result of the computation will be + * stored. + * @return true if the computation was successful; false otherwise. + */ +static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, + struct ggml_tensor* dst) { + switch (dst->op) { + case GGML_OP_REPEAT: + ggml_cann_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggml_cann_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_ADD: + ggml_cann_add(ctx, dst); + break; + case GGML_OP_ACC: + ggml_cann_acc(ctx, dst); + break; + case GGML_OP_MUL: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_DIV: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_SILU: + ggml_cann_activation( + ctx, dst); + break; + // TODO: Use faster gelu?? + case GGML_UNARY_OP_GELU_QUICK: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_TANH: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_RELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_HARDSIGMOID: + ggml_cann_activation(ctx, dst); + break; + case GGML_UNARY_OP_HARDSWISH: + ggml_cann_activation(ctx, dst); + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggml_cann_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggml_cann_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggml_cann_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggml_cann_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggml_cann_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggml_cann_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_cann_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggml_cann_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggml_cann_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggml_cann_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggml_cann_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggml_cann_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggml_cann_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggml_cann_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_DIAG_MASK_INF: + ggml_cann_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggml_cann_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggml_cann_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggml_cann_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggml_cann_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggml_cann_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggml_cann_argsort(ctx, dst); + break; + default: + return false; + } + + return true; +} + +// backend +/** + * @brief Retrieves the name associated with the CANN backend. + * + * This function returns the name assigned to the CANN backend, which is stored + * in the context of the provided backend structure. + * + * @param backend Pointer to the CANN backend structure. + * @return A pointer to a constant string representing the backend name. + */ +GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return cann_ctx->name.c_str(); +} + +/** + * @brief Frees resources associated with the CANN backend. + * + * This function releases resources associated with the CANN backend context + * and resets the device associated with the backend to its initial state. + * + * @param backend Pointer to the CANN backend structure to be freed. + */ +GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ACL_CHECK(aclrtSynchronizeDevice()); + ACL_CHECK(aclrtResetDevice(cann_ctx->device)); + + // finalize when last backend freed. + if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) { + ACL_CHECK(aclFinalize()); + } + + delete cann_ctx; + delete backend; +} + +/** + * @brief Retrieves the default buffer type associated with the CANN backend. + * + * This function returns the buffer type specific to the device associated + * with the CANN backend. It is used to allocate buffers for computations + * performed by the backend. + * + * @param backend Pointer to the CANN backend structure. + * @return Pointer to the buffer type structure for the CANN backend. + */ +GGML_CALL static ggml_backend_buffer_type_t +ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return ggml_backend_cann_buffer_type(cann_ctx->device); +} + +/** + * @brief Sets tensor data asynchronously in the CANN backend. + * + * This function asynchronously sets tensor data in the CANN backend. Depending + * on the tensor type, it may perform data transformations before copying data + * to the device. + * + * @param backend Pointer to the CANN backend structure. + * @param tensor Pointer to the tensor structure to set data for. + * @param data Pointer to the host data to copy to the tensor. + * @param offset Offset in bytes within the host data. + * @param size Size of the data to copy in bytes. + */ +GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, + ggml_tensor* tensor, + const void* data, + size_t offset, + size_t size) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync( + tensor->data, size, (const char*)data + offset, size, + ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); + } else { + void* transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, (const char*)data + offset, + transform_buffer); + +#ifndef NDEBUG + void* check_buffer = malloc(size); + ggml_backend_cann_transform_back(tensor, transform_buffer, + check_buffer); + GGML_ASSERT(memcmp((const char*)data + offset, check_buffer, size)); + free(check_buffer); +#endif + ACL_CHECK(aclrtMemcpyAsync(tensor->data, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + free(transform_buffer); + } +} + +GGML_CALL static void ggml_backend_cann_get_tensor_async( + ggml_backend_t backend, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ggml_backend_buffer_t buf = + tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && + "unsupported buffer type"); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync((char*)data + offset, size, tensor->data, + size, ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpyAsync(transform_buffer, size, tensor->data, size, + ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + ggml_backend_cann_transform_back(tensor, transform_buffer, + (char*)data + offset); + free(transform_buffer); + } +} + +/** + * @brief Asynchronously copies tensor data between CANN backends. + * + * This function copies tensor data asynchronously between two CANN backends. It + * checks if both tensors reside in CANN buffers and whether the devices support + * peer-to-peer access for direct copying. If not, it returns false. + * + * @param backend_src Pointer to the source CANN backend structure. + * @param backend_dst Pointer to the destination CANN backend structure. + * @param src Pointer to the source tensor to copy data from. + * @param dst Pointer to the destination tensor to copy data to. + * @return true if the copy operation succeeds, false otherwise. + */ +GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( + ggml_backend_t backend_src, ggml_backend_t backend_dst, + const ggml_tensor* src, ggml_tensor* dst) { + GGML_ASSERT(ggml_backend_is_cann(backend_src) || + ggml_backend_is_cann(backend_dst)); + + if (!ggml_backend_buffer_is_cann(src->buffer) || + !ggml_backend_buffer_is_cann(dst->buffer)) { + return false; + } + + ggml_backend_buffer_t buf_src = + src->view_src ? src->view_src->buffer : src->buffer; + ggml_backend_buffer_t buf_dst = + dst->view_src ? dst->view_src->buffer : dst->buffer; + + ggml_backend_cann_context* cann_ctx_src = + (ggml_backend_cann_context*)backend_src->context; + ggml_backend_cann_context* cann_ctx_dst = + (ggml_backend_cann_context*)backend_dst->context; + + size_t copy_size = ggml_nbytes(dst); + if (backend_src != backend_dst) { + ggml_backend_cann_buffer_context* buf_ctx_src = + (ggml_backend_cann_buffer_context*)buf_src->context; + ggml_backend_cann_buffer_context* buf_ctx_dst = + (ggml_backend_cann_buffer_context*)buf_dst->context; + + GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device); + GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device); + + int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, + cann_ctx_dst->device)); + if (!canAccessPeer) { + return false; + } + + ggml_cann_set_device(cann_ctx_src->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + + // record event on src stream + if (!cann_ctx_src->copy_event) { + ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event)); + } + + ACL_CHECK( + aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream())); + + // wait on dst stream for the copy to complete + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), + cann_ctx_src->copy_event)); + } else { + // src and dst are on the same backend + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + } + + return true; +} + +/** + * @brief Synchronizes a CANN backend. + * + * This function synchronizes the specified CANN backend by waiting for all + * operations in its associated stream to complete. + * + * @param backend Pointer to the CANN backend structure to synchronize. + */ +GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); +} + +/** + * @brief Computes a computational graph using a CANN backend. + * + * This function computes the operations defined in the computational graph + * using the specified CANN backend. + * + * @param backend Pointer to the CANN backend structure to use for computation. + * @param cgraph Pointer to the computational graph structure containing nodes + * representing operations to be computed. + * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation + * completes successfully, otherwise an appropriate error status. + */ +GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute( + ggml_backend_t backend, ggml_cgraph* cgraph) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor* node = cgraph->nodes[i]; + + if (ggml_is_empty(node) || node->op == GGML_OP_NONE) { + continue; + } + + bool ok = ggml_cann_compute_forward(*cann_ctx, node); + + if (!ok) { + GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, + node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + } + + return GGML_STATUS_SUCCESS; +} + +/** + * @brief Checks if the CANN backend supports a specific operation. + * + * This function checks whether the specified operation is supported by the + * CANN backend. + * + * @param backend Pointer to the CANN backend structure to check support for + * the operation. + * @param op Pointer to the tensor representing the operation to check. + * @return bool Returns true if the operation is supported by the backend, + * otherwise false. + */ +GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend, + const ggml_tensor* op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + case GGML_OP_MUL_MAT: { + switch (op->src[0]->type) { + // case GGML_TYPE_Q4_0: + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + case GGML_OP_MUL_MAT_ID: + return false; + // embedding + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; + case GGML_OP_CPY: { + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + case GGML_OP_DUP: + case GGML_OP_REPEAT: + case GGML_OP_CONCAT: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +/** + * @brief Checks if the backend buffer type is associated with the CANN backend. + * + * This function checks whether the provided backend buffer type is associated + * with the CANN backend based on the comparison of its name retrieval function + * pointer. + * + * @param buft Pointer to the backend buffer type to check. + * @return bool Returns true if the buffer type is associated with the CANN + * backend, otherwise false. + */ +static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cann_buffer_type_name; +} + +/** + * @brief Checks if the CANN backend supports a specific backend buffer type. + * + * This function determines whether the CANN backend supports the given backend + * buffer type by comparing the device context of the backend and buffer type. + * It returns true if the device associated with the buffer type matches the + * device associated with the backend. + * + * @param backend Pointer to the CANN backend. + * @param buft Pointer to the backend buffer type to check. + * @return bool Returns true if the CANN backend supports the buffer type, + * otherwise false. + */ +GGML_CALL static bool ggml_backend_cann_supports_buft( + ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cann_buffer_type_name; + + GGML_UNUSED(backend); +} + +/** + * @brief Determines if a tensor operation should be offloaded to the CANN + * backend. + * + * This function checks if a given tensor operation should be offloaded to the + * CANN backend based on the operation type and the size of the tensor. It + * returns true if the second dimension (ne[1]) of the tensor is greater than or + * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS. + * + * @param backend Pointer to the CANN backend. + * @param op Pointer to the tensor operation to check. + * @return bool Returns true if the operation should be offloaded, otherwise + * false. + */ +GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend, + const ggml_tensor* op) { + const int min_batch_size = 32; + GGML_UNUSED(backend); + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + +/** + * @brief Creates a new event for the CANN backend. + * + * This function initializes a new event for the CANN backend by setting the + * device and creating an ACL runtime event. The created event is then wrapped + * in a ggml_backend_event structure and returned. + * + * @param backend Pointer to the CANN backend. + * @return ggml_backend_event_t Returns a pointer to the new event structure. + */ +static ggml_backend_event_t ggml_backend_cann_event_new( + ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + aclrtEvent event; + ACL_CHECK(aclrtCreateEvent(&event)); + + return new ggml_backend_event{ + /* .backend = */ backend, + /* .context = */ event, + }; +} + +/** + * @brief Frees a CANN backend event. + * + * This function destroys the ACL runtime event associated with the given CANN + * backend event and then deletes the event structure itself. + * + * @param event Pointer to the event structure to be freed. + */ +static void ggml_backend_cann_event_free(ggml_backend_event_t event) { + ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); + + delete event; +} + +/** + * @brief Records an event on the CANN backend stream. + * + * This function records the given event on the ACL runtime stream associated + * with the backend context. + * + * @param event Pointer to the event structure to be recorded. + */ +static void ggml_backend_cann_event_record(ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)event->backend->context; + + ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); +} + +/** + * @brief Waits for a recorded event to complete on the CANN backend stream. + * + * This function makes the given backend wait for the event to complete on its + * ACL runtime stream. + * + * @param backend Pointer to the backend structure. + * @param event Pointer to the event structure that the backend needs to wait + * for. + */ +static void ggml_backend_cann_event_wait(ggml_backend_t backend, + ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + if (ggml_backend_is_cann(event->backend)) { + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), + (aclrtEvent)event->context)); + } else { + GGML_ASSERT(false); + } +} + +/** + * @brief Synchronizes the given event on the CANN backend. + * + * This function waits for the specified event to complete on the ACL runtime. + * + * @param event Pointer to the event structure to be synchronized. + */ +static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) { + ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); +} + +/** + * @brief Structure defining the interface for the CANN backend. + * + * This structure contains function pointers for various operations + * supported by the CANN backend, including name retrieval, memory + * management, tensor operations, synchronization, and event handling. + */ +static ggml_backend_i ggml_backend_cann_interface = { + /* .get_name = */ ggml_backend_cann_name, + /* .free = */ ggml_backend_cann_free, + /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async, + /* .synchronize = */ ggml_backend_cann_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_cann_graph_compute, + /* .supports_op = */ ggml_backend_cann_supports_op, + /* .supports_buft = */ ggml_backend_cann_supports_buft, + /* .offload_op = */ ggml_backend_cann_offload_op, + /* .event_new = */ ggml_backend_cann_event_new, + /* .event_free = */ ggml_backend_cann_event_free, + /* .event_record = */ ggml_backend_cann_event_record, + /* .event_wait = */ ggml_backend_cann_event_wait, + /* .event_synchronize = */ ggml_backend_cann_event_synchronize, +}; + +/** + * @brief Return the hardcoded GUID for the CANN backend. + * + * This function returns a static GUID which uniquely identifies the CANN + * backend. + * + * @return A pointer to the static GUID. + */ +static ggml_guid_t ggml_backend_cann_guid() { + static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34, + 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64}; + return &guid; +} + +GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { + aclInit(nullptr); + if (device < 0 || device >= ggml_backend_cann_get_device_count()) { + GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device); + return nullptr; + } + + ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + if (ctx == nullptr) { + GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return nullptr; + } + + ggml_backend_t cann_backend = + new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), + /* .interface = */ ggml_backend_cann_interface, + /* .context = */ ctx}; + + return cann_backend; +} + +GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) { + return backend != NULL && + ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); +} + +GGML_CALL int32_t ggml_backend_cann_get_device_count() { + return ggml_cann_info().device_count; +} + +GGML_CALL void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size) { + ggml_cann_set_device(device); + const char* soc_name = aclrtGetSocName(); + snprintf(description, description_size, "%s", soc_name); +} + +GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, + size_t* total) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); +} + +// backend registry +/** + * @brief Initializes a CANN backend based on the provided parameters. + * + * This function initializes a CANN backend using the device index and then + * initializes the backend using `ggml_backend_cann_init`. + * + * @param params Parameters for initialization (unused in this implementation). + * @param user_data User data containing the device index to initialize the + * backend. + * @return ggml_backend_t The initialized CANN backend. + */ +GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params, + void* user_data) { + ggml_backend_t cann_backend = + ggml_backend_cann_init((int)(intptr_t)user_data); + return cann_backend; + + GGML_UNUSED(params); +} + +extern "C" GGML_CALL int ggml_backend_cann_reg_devices(); + +/** + * @brief Registers CANN (Ascend) devices as backend options. + * + * This function initializes ACL, retrieves the number of available CANN + * devices, and registers each device as a backend option using + * `ggml_backend_register`. Each device is given a unique name based on + * `GGML_CANN_NAME` followed by its index. + * + * @return int The number of CANN devices registered. + */ +GGML_CALL int ggml_backend_cann_reg_devices() { + uint32_t device_count = ggml_backend_cann_get_device_count(); + // initialization + for (uint32_t i = 0; i < device_count; i++) { + char name[128]; + snprintf(name, sizeof(name), "CANN%d", i); + ggml_backend_register(name, ggml_backend_reg_cann_init, + ggml_backend_cann_buffer_type(i), + (void*)(intptr_t)i); + } + return device_count; +} diff --git a/ggml/src/ggml-cann/.clang-format b/ggml/src/ggml-cann/.clang-format new file mode 100644 index 000000000..2ad03d739 --- /dev/null +++ b/ggml/src/ggml-cann/.clang-format @@ -0,0 +1,168 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: true +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 2 + SortPriority: 0 + - Regex: '^<.*\.h>' + Priority: 1 + SortPriority: 0 + - Regex: '^<.*' + Priority: 2 + SortPriority: 0 + - Regex: '.*' + Priority: 3 + SortPriority: 0 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IncludeIsMainSourceRegex: '' +IndentCaseLabels: true +IndentGotoLabels: true +IndentPPDirectives: None +IndentWidth: 4 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +Standard: Auto +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +... + diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile new file mode 100644 index 000000000..2b009e8f9 --- /dev/null +++ b/ggml/src/ggml-cann/Doxyfile @@ -0,0 +1,2579 @@ +# Doxyfile 1.8.17 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "llama.cpp" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "llama inference engine" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = YES + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = YES + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# (including Cygwin) ands Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen +# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.doc \ + *.txt \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /